--- /dev/null
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="447.99197"
+ height="428.19299"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.3.1 r9886"
+ sodipodi:docname="GPpartitionReaders1.svg">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lend"
+ style="overflow:visible">
+ <path
+ id="path3792"
+ style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lstart"
+ style="overflow:visible">
+ <path
+ id="path3789"
+ style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(1.1,0,0,1.1,1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="1.6184291"
+ inkscape:cx="223.99599"
+ inkscape:cy="214.0965"
+ inkscape:document-units="px"
+ inkscape:current-layer="layer1"
+ showgrid="false"
+ inkscape:window-width="979"
+ inkscape:window-height="836"
+ inkscape:window-x="571"
+ inkscape:window-y="335"
+ inkscape:window-maximized="0"
+ fit-margin-top="5"
+ fit-margin-left="5"
+ fit-margin-right="5"
+ fit-margin-bottom="5" />
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(-28.441125,-185.60612)">
+ <flowRoot
+ xml:space="preserve"
+ id="flowRoot2985"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ id="flowRegion2987"><rect
+ id="rect2989"
+ width="82.85714"
+ height="11.428572"
+ x="240"
+ y="492.36218" /></flowRegion><flowPara
+ id="flowPara2991"></flowPara></flowRoot> <g
+ id="g4433"
+ transform="translate(2,0)">
+ <text
+ sodipodi:linespacing="125%"
+ id="text2993"
+ y="-261.66608"
+ x="412.12299"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ xml:space="preserve"
+ transform="matrix(0,1,-1,0,0,0)"><tspan
+ y="-261.66608"
+ x="412.12299"
+ id="tspan2995"
+ sodipodi:role="line">synchronize_rcu()</tspan></text>
+ <g
+ id="g4417"
+ transform="matrix(0,1,-1,0,730.90257,222.4928)">
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
+ d="m 97.580736,477.4048 183.140664,0"
+ id="path2997"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 96.752718,465.38398 0,22.62742"
+ id="path4397"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 281.54942,465.38397 0,22.62742"
+ id="path4397-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ </g>
+ </g>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.04738"
+ y="268.18076"
+ id="text4429"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431"
+ x="112.04738"
+ y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.04738"
+ y="439.13766"
+ id="text4441"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4443"
+ x="112.04738"
+ y="439.13766">WRITE_ONCE(b, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="255.60869"
+ y="309.29346"
+ id="text4445"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4447"
+ x="255.60869"
+ y="309.29346">r1 = READ_ONCE(a);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="255.14423"
+ y="520.61786"
+ id="text4449"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4451"
+ x="255.14423"
+ y="520.61786">WRITE_ONCE(c, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="384.71124"
+ id="text4453"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4455"
+ x="396.10254"
+ y="384.71124">r2 = READ_ONCE(b);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="582.13617"
+ id="text4457"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4459"
+ x="396.10254"
+ y="582.13617">r3 = READ_ONCE(c);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.08231"
+ y="213.91006"
+ id="text4461"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463"
+ x="112.08231"
+ y="213.91006">thread0()</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="252.34512"
+ y="213.91006"
+ id="text4461-6"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463-0"
+ x="252.34512"
+ y="213.91006">thread1()</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.42557"
+ y="213.91006"
+ id="text4461-2"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463-2"
+ x="396.42557"
+ y="213.91006">thread2()</tspan></text>
+ <rect
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="rect4495"
+ width="436.28488"
+ height="416.4859"
+ x="34.648232"
+ y="191.10612" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ d="m 183.14066,191.10612 0,417.193 -0.70711,0"
+ id="path4497"
+ inkscape:connector-curvature="0" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ d="m 325.13867,191.10612 0,417.193 -0.70711,0"
+ id="path4497-5"
+ inkscape:connector-curvature="0" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="111.75929"
+ y="251.53981"
+ id="text4429-8"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9"
+ x="111.75929"
+ y="251.53981">rcu_read_lock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="367.91556"
+ id="text4429-8-9"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4"
+ x="396.10254"
+ y="367.91556">rcu_read_lock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="597.40289"
+ id="text4429-8-9-3"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4-4"
+ x="396.10254"
+ y="597.40289">rcu_read_unlock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="111.75929"
+ y="453.15311"
+ id="text4429-8-9-3-1"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4-4-6"
+ x="111.75929"
+ y="453.15311">rcu_read_unlock();</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 33.941125,227.87568 436.284885,0 0,0.7071"
+ id="path4608"
+ inkscape:connector-curvature="0" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="394.94427"
+ y="345.66351"
+ id="text4648"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650"
+ x="394.94427"
+ y="345.66351">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(36.441125,199.60612)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.11968"
+ y="475.77856"
+ id="text4648-4"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-4"
+ x="112.11968"
+ y="475.77856">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-7"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(-246.38346,329.72117)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <path
+ sodipodi:type="arc"
+ style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-7-7"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(-103.65246,202.90878)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="254.85066"
+ y="348.96619"
+ id="text4648-4-3"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-4-5"
+ x="254.85066"
+ y="348.96619">QS</tspan></text>
+ </g>
+</svg>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Creator: fig2dev Version 3.2 Patchlevel 5d -->
+
+<!-- CreationDate: Tue Mar 4 18:34:25 2014 -->
+
+<!-- Magnification: 3.000 -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="1089.1382"
+ height="668.21368"
+ viewBox="-2121 -36 14554.634 8876.4061"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.3.1 r9886"
+ sodipodi:docname="RCUApplicability.svg">
+ <metadata
+ id="metadata40">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title />
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <defs
+ id="defs38" />
+ <sodipodi:namedview
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1"
+ objecttolerance="10"
+ gridtolerance="10"
+ guidetolerance="10"
+ inkscape:pageopacity="0"
+ inkscape:pageshadow="2"
+ inkscape:window-width="849"
+ inkscape:window-height="639"
+ id="namedview36"
+ showgrid="false"
+ inkscape:zoom="0.51326165"
+ inkscape:cx="544.56912"
+ inkscape:cy="334.10686"
+ inkscape:window-x="149"
+ inkscape:window-y="448"
+ inkscape:window-maximized="0"
+ inkscape:current-layer="g4"
+ fit-margin-top="5"
+ fit-margin-left="5"
+ fit-margin-right="5"
+ fit-margin-bottom="5" />
+ <g
+ style="fill:none;stroke-width:0.025in"
+ id="g4"
+ transform="translate(-2043.6828,14.791398)">
+ <!-- Line: box -->
+ <rect
+ x="0"
+ y="0"
+ width="14400"
+ height="8775"
+ rx="0"
+ style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
+ id="rect6" />
+ <!-- Line: box -->
+ <rect
+ x="1350"
+ y="0"
+ width="11700"
+ height="6075"
+ rx="0"
+ style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
+ id="rect8" />
+ <!-- Line: box -->
+ <rect
+ x="2700"
+ y="0"
+ width="9000"
+ height="4275"
+ rx="0"
+ style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
+ id="rect10" />
+ <!-- Line: box -->
+ <rect
+ x="4050"
+ y="0"
+ width="6300"
+ height="2475"
+ rx="0"
+ style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
+ id="rect12" />
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="900"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text14"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3017">Read-Mostly, Stale &amp;</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="1350"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text16"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3019">Inconsistent Data OK</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="1800"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text18"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3021">(RCU Works Great!!!)</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="3825"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text20"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3023">(RCU Works Well)</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="3375"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text22"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="5175"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text24"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3027">Read-Write, Need Consistent Data</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="6975"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text26"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="5625"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text28"
+ sodipodi:linespacing="125%"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ id="tspan3029">(RCU Might Be OK...)</tspan></text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="7875"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text30"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="8325"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text32"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text>
+ <!-- Text -->
+ <text
+ xml:space="preserve"
+ x="7200"
+ y="7425"
+ font-style="normal"
+ font-weight="normal"
+ font-size="324"
+ id="text34"
+ style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
+ sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text>
+ </g>
+</svg>
--- /dev/null
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="735.25"
+ height="516.21875"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.3.1 r9886"
+ sodipodi:docname="ReadersPartitionGP1.svg">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lend"
+ style="overflow:visible">
+ <path
+ id="path3792"
+ style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lstart"
+ style="overflow:visible">
+ <path
+ id="path3789"
+ style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(1.1,0,0,1.1,1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lstart"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lstart-4"
+ style="overflow:visible">
+ <path
+ id="path3789-9"
+ style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(1.1,0,0,1.1,1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0"
+ refX="0"
+ id="Arrow2Lend-4"
+ style="overflow:visible">
+ <path
+ id="path3792-4"
+ style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
+ d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
+ transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
+ inkscape:connector-curvature="0" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="1.3670394"
+ inkscape:cx="367.26465"
+ inkscape:cy="258.46182"
+ inkscape:document-units="px"
+ inkscape:current-layer="g4433-6"
+ showgrid="false"
+ inkscape:window-width="1351"
+ inkscape:window-height="836"
+ inkscape:window-x="438"
+ inkscape:window-y="335"
+ inkscape:window-maximized="0"
+ fit-margin-top="5"
+ fit-margin-left="5"
+ fit-margin-right="5"
+ fit-margin-bottom="5" />
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title />
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(-29.15625,-185.59375)">
+ <flowRoot
+ xml:space="preserve"
+ id="flowRoot2985"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
+ id="flowRegion2987"><rect
+ id="rect2989"
+ width="82.85714"
+ height="11.428572"
+ x="240"
+ y="492.36218" /></flowRegion><flowPara
+ id="flowPara2991" /></flowRoot> <g
+ id="g4433"
+ transform="translate(2,-12)">
+ <text
+ sodipodi:linespacing="125%"
+ id="text2993"
+ y="-261.66608"
+ x="436.12299"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ xml:space="preserve"
+ transform="matrix(0,1,-1,0,0,0)"><tspan
+ y="-261.66608"
+ x="436.12299"
+ id="tspan2995"
+ sodipodi:role="line">synchronize_rcu()</tspan></text>
+ <g
+ id="g4417"
+ transform="matrix(0,1,-1,0,730.90257,222.4928)">
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
+ d="M 97.580736,477.4048 327.57913,476.09759"
+ id="path2997"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 96.752718,465.38398 0,22.62742"
+ id="path4397"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 328.40703,465.38397 0,22.62742"
+ id="path4397-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ </g>
+ </g>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.04738"
+ y="268.18076"
+ id="text4429"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431"
+ x="112.04738"
+ y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.04738"
+ y="487.13766"
+ id="text4441"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4443"
+ x="112.04738"
+ y="487.13766">WRITE_ONCE(b, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="255.60869"
+ y="297.29346"
+ id="text4445"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4447"
+ x="255.60869"
+ y="297.29346">r1 = READ_ONCE(a);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="255.14423"
+ y="554.61786"
+ id="text4449"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4451"
+ x="255.14423"
+ y="554.61786">WRITE_ONCE(c, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="370.71124"
+ id="text4453"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4455"
+ x="396.10254"
+ y="370.71124">WRITE_ONCE(d, 1);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="572.13617"
+ id="text4457"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4459"
+ x="396.10254"
+ y="572.13617">r2 = READ_ONCE(c);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.08231"
+ y="213.91006"
+ id="text4461"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463"
+ x="112.08231"
+ y="213.91006">thread0()</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="252.34512"
+ y="213.91006"
+ id="text4461-6"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463-0"
+ x="252.34512"
+ y="213.91006">thread1()</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.42557"
+ y="213.91006"
+ id="text4461-2"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463-2"
+ x="396.42557"
+ y="213.91006">thread2()</tspan></text>
+ <rect
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="rect4495"
+ width="724.25244"
+ height="505.21201"
+ x="34.648232"
+ y="191.10612" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ d="m 183.14066,191.10612 0,504.24243"
+ id="path4497"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ d="m 325.13867,191.10612 0,504.24243"
+ id="path4497-5"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="111.75929"
+ y="251.53981"
+ id="text4429-8"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9"
+ x="111.75929"
+ y="251.53981">rcu_read_lock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="353.91556"
+ id="text4429-8-9"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4"
+ x="396.10254"
+ y="353.91556">rcu_read_lock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="396.10254"
+ y="587.40289"
+ id="text4429-8-9-3"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4-4"
+ x="396.10254"
+ y="587.40289">rcu_read_unlock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="111.75929"
+ y="501.15311"
+ id="text4429-8-9-3-1"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4-4-6"
+ x="111.75929"
+ y="501.15311">rcu_read_unlock();</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 33.941125,227.87568 724.941765,0"
+ id="path4608"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="394.94427"
+ y="331.66351"
+ id="text4648"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650"
+ x="394.94427"
+ y="331.66351">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(36.441125,185.60612)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="112.11968"
+ y="523.77856"
+ id="text4648-4"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-4"
+ x="112.11968"
+ y="523.77856">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-7"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(-246.38346,377.72117)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <path
+ sodipodi:type="arc"
+ style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-7-7"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(-103.65246,190.90878)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="254.85066"
+ y="336.96619"
+ id="text4648-4-3"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-4-5"
+ x="254.85066"
+ y="336.96619">QS</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ d="m 470.93311,190.39903 0,504.24243"
+ id="path4497-5-6"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ d="m 616.22755,190.38323 0,504.24243"
+ id="path4497-5-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <g
+ id="g4433-6"
+ transform="translate(288.0964,78.32827)">
+ <text
+ sodipodi:linespacing="125%"
+ id="text2993-7"
+ y="-261.66608"
+ x="440.12299"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ xml:space="preserve"
+ transform="matrix(0,1,-1,0,0,0)"><tspan
+ y="-261.66608"
+ x="440.12299"
+ id="tspan2995-1"
+ sodipodi:role="line">synchronize_rcu()</tspan></text>
+ <g
+ id="g4417-1"
+ transform="matrix(0,1,-1,0,730.90257,222.4928)">
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
+ d="M 97.580736,477.4048 328.5624,477.07246"
+ id="path2997-2"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 96.752718,465.38398 0,22.62742"
+ id="path4397-3"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ <path
+ style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 329.39039,465.38397 0,22.62742"
+ id="path4397-5-4"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cc" />
+ </g>
+ </g>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="541.70508"
+ y="387.6217"
+ id="text4445-0"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4447-5"
+ x="541.70508"
+ y="387.6217">r3 = READ_ONCE(d);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="541.2406"
+ y="646.94611"
+ id="text4449-6"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4451-6"
+ x="541.2406"
+ y="646.94611">WRITE_ONCE(e, 1);</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-7-7-5"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(182.44393,281.23704)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="540.94702"
+ y="427.29443"
+ id="text4648-4-3-1"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-4-5-7"
+ x="540.94702"
+ y="427.29443">QS</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="686.27747"
+ y="461.83929"
+ id="text4453-7"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4455-1"
+ x="686.27747"
+ y="461.83929">r4 = READ_ONCE(b);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="686.27747"
+ y="669.26422"
+ id="text4457-9"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4459-2"
+ x="686.27747"
+ y="669.26422">r5 = READ_ONCE(e);</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="686.27747"
+ y="445.04358"
+ id="text4429-8-9-33"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4-2"
+ x="686.27747"
+ y="445.04358">rcu_read_lock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="686.27747"
+ y="684.53094"
+ id="text4429-8-9-3-8"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4431-9-4-4-5"
+ x="686.27747"
+ y="684.53094">rcu_read_unlock();</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="685.11914"
+ y="422.79153"
+ id="text4648-9"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-7"
+ x="685.11914"
+ y="422.79153">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-8"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(326.61602,276.73415)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="397.85934"
+ y="609.59003"
+ id="text4648-5"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-77"
+ x="397.85934"
+ y="609.59003">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-80"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(39.356201,463.53264)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="256.75986"
+ y="586.99133"
+ id="text4648-5-2"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4650-77-7"
+ x="256.75986"
+ y="586.99133">QS</tspan></text>
+ <path
+ sodipodi:type="arc"
+ style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
+ id="path4652-80-5"
+ sodipodi:cx="358.85669"
+ sodipodi:cy="142.87541"
+ sodipodi:rx="10.960155"
+ sodipodi:ry="10.253048"
+ d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
+ transform="translate(-101.74328,440.93395)"
+ sodipodi:start="4.7135481"
+ sodipodi:end="10.994651"
+ sodipodi:open="true" />
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="546.22791"
+ y="213.91006"
+ id="text4461-2-5"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463-2-6"
+ x="546.22791"
+ y="213.91006">thread3()</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
+ x="684.00067"
+ y="213.91006"
+ id="text4461-2-1"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan4463-2-0"
+ x="684.00067"
+ y="213.91006">thread4()</tspan></text>
+ </g>
+</svg>
--- /dev/null
+<!-- DO NOT HAND EDIT. -->
+<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' -->
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ "http://www.w3.org/TR/html4/loose.dtd">
+ <html>
+ <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
+ <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
+
+<h1>A Tour Through RCU's Requirements</h1>
+
+<p>Copyright IBM Corporation, 2015</p>
+<p>Author: Paul E. McKenney</p>
+<p><i>The initial version of this document appeared in the
+<a href="https://lwn.net/">LWN</a> articles
+<a href="https://lwn.net/Articles/652156/">here</a>,
+<a href="https://lwn.net/Articles/652677/">here</a>, and
+<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
+
+<h2>Introduction</h2>
+
+<p>
+Read-copy update (RCU) is a synchronization mechanism that is often
+used as a replacement for reader-writer locking.
+RCU is unusual in that updaters do not block readers,
+which means that RCU's read-side primitives can be exceedingly fast
+and scalable.
+In addition, updaters can make useful forward progress concurrently
+with readers.
+However, all this concurrency between RCU readers and updaters does raise
+the question of exactly what RCU readers are doing, which in turn
+raises the question of exactly what RCU's requirements are.
+
+<p>
+This document therefore summarizes RCU's requirements, and can be thought
+of as an informal, high-level specification for RCU.
+It is important to understand that RCU's specification is primarily
+empirical in nature;
+in fact, I learned about many of these requirements the hard way.
+This situation might cause some consternation; however, not only
+has this learning process been a lot of fun, but it has also been
+a great privilege to work with so many people willing to apply
+technologies in interesting new ways.
+
+<p>
+All that aside, here are the categories of currently known RCU requirements:
+</p>
+
+<ol>
+<li> <a href="#Fundamental Requirements">
+ Fundamental Requirements</a>
+<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
+<li> <a href="#Parallelism Facts of Life">
+ Parallelism Facts of Life</a>
+<li> <a href="#Quality-of-Implementation Requirements">
+ Quality-of-Implementation Requirements</a>
+<li> <a href="#Linux Kernel Complications">
+ Linux Kernel Complications</a>
+<li> <a href="#Software-Engineering Requirements">
+ Software-Engineering Requirements</a>
+<li> <a href="#Other RCU Flavors">
+ Other RCU Flavors</a>
+<li> <a href="#Possible Future Changes">
+ Possible Future Changes</a>
+</ol>
+
+<p>
+This is followed by a <a href="#Summary">summary</a>,
+which is in turn followed by the inevitable
+<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
+
+<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
+
+<p>
+RCU's fundamental requirements are the closest thing RCU has to hard
+mathematical requirements.
+These are:
+
+<ol>
+<li> <a href="#Grace-Period Guarantee">
+ Grace-Period Guarantee</a>
+<li> <a href="#Publish-Subscribe Guarantee">
+ Publish-Subscribe Guarantee</a>
+<li> <a href="#Memory-Barrier Guarantees">
+ Memory-Barrier Guarantees</a>
+<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
+ RCU Primitives Guaranteed to Execute Unconditionally</a>
+<li> <a href="#Guaranteed Read-to-Write Upgrade">
+ Guaranteed Read-to-Write Upgrade</a>
+</ol>
+
+<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
+
+<p>
+RCU's grace-period guarantee is unusual in being premeditated:
+Jack Slingwine and I had this guarantee firmly in mind when we started
+work on RCU (then called “rclock”) in the early 1990s.
+That said, the past two decades of experience with RCU have produced
+a much more detailed understanding of this guarantee.
+
+<p>
+RCU's grace-period guarantee allows updaters to wait for the completion
+of all pre-existing RCU read-side critical sections.
+An RCU read-side critical section
+begins with the marker <tt>rcu_read_lock()</tt> and ends with
+the marker <tt>rcu_read_unlock()</tt>.
+These markers may be nested, and RCU treats a nested set as one
+big RCU read-side critical section.
+Production-quality implementations of <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
+fact have exactly zero overhead in Linux kernels built for production
+use with <tt>CONFIG_PREEMPT=n</tt>.
+
+<p>
+This guarantee allows ordering to be enforced with extremely low
+overhead to readers, for example:
+
+<blockquote>
+<pre>
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5 rcu_read_lock();
+ 6 r1 = READ_ONCE(x);
+ 7 r2 = READ_ONCE(y);
+ 8 rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13 WRITE_ONCE(x, 1);
+14 synchronize_rcu();
+15 WRITE_ONCE(y, 1);
+16 }
+</pre>
+</blockquote>
+
+<p>
+Because the <tt>synchronize_rcu()</tt> on line 14 waits for
+all pre-existing readers, any instance of <tt>thread0()</tt> that
+loads a value of zero from <tt>x</tt> must complete before
+<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
+also load a value of zero from <tt>y</tt>.
+Similarly, any instance of <tt>thread0()</tt> that loads a value of
+one from <tt>y</tt> must have started after the
+<tt>synchronize_rcu()</tt> started, and must therefore also load
+a value of one from <tt>x</tt>.
+Therefore, the outcome:
+<blockquote>
+<pre>
+(r1 == 0 && r2 == 1)
+</pre>
+</blockquote>
+cannot happen.
+
+<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a>
+Wait a minute!
+You said that updaters can make useful forward progress concurrently
+with readers, but pre-existing readers will block
+<tt>synchronize_rcu()</tt>!!!
+Just who are you trying to fool???
+<br><a href="#qq1answer">Answer</a>
+
+<p>
+This scenario resembles one of the first uses of RCU in
+<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
+which managed a distributed lock manager's transition into
+a state suitable for handling recovery from node failure,
+more or less as follows:
+
+<blockquote>
+<pre>
+ 1 #define STATE_NORMAL 0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING 2
+ 4 #define STATE_WANT_NORMAL 3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10 int state_snap;
+11
+12 rcu_read_lock();
+13 state_snap = READ_ONCE(state);
+14 if (state_snap == STATE_NORMAL)
+15 do_something();
+16 else
+17 do_something_carefully();
+18 rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23 WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24 synchronize_rcu();
+25 WRITE_ONCE(state, STATE_RECOVERING);
+26 recovery();
+27 WRITE_ONCE(state, STATE_WANT_NORMAL);
+28 synchronize_rcu();
+29 WRITE_ONCE(state, STATE_NORMAL);
+30 }
+</pre>
+</blockquote>
+
+<p>
+The RCU read-side critical section in <tt>do_something_dlm()</tt>
+works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
+to guarantee that <tt>do_something()</tt> never runs concurrently
+with <tt>recovery()</tt>, but with little or no synchronization
+overhead in <tt>do_something_dlm()</tt>.
+
+<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a>
+Why is the <tt>synchronize_rcu()</tt> on line 28 needed?
+<br><a href="#qq2answer">Answer</a>
+
+<p>
+In order to avoid fatal problems such as deadlocks,
+an RCU read-side critical section must not contain calls to
+<tt>synchronize_rcu()</tt>.
+Similarly, an RCU read-side critical section must not
+contain anything that waits, directly or indirectly, on completion of
+an invocation of <tt>synchronize_rcu()</tt>.
+
+<p>
+Although RCU's grace-period guarantee is useful in and of itself, with
+<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
+it would be good to be able to use RCU to coordinate read-side
+access to linked data structures.
+For this, the grace-period guarantee is not sufficient, as can
+be seen in function <tt>add_gp_buggy()</tt> below.
+We will look at the reader's code later, but in the meantime, just think of
+the reader as locklessly picking up the <tt>gp</tt> pointer,
+and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
+<tt>->a</tt> and <tt>->b</tt> fields.
+
+<blockquote>
+<pre>
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4 if (!p)
+ 5 return -ENOMEM;
+ 6 spin_lock(&gp_lock);
+ 7 if (rcu_access_pointer(gp)) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+11 p->a = a;
+12 p->b = b;
+13 gp = p; /* ORDERING BUG */
+14 spin_unlock(&gp_lock);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+The problem is that both the compiler and weakly ordered CPUs are within
+their rights to reorder this code as follows:
+
+<blockquote>
+<pre>
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4 if (!p)
+ 5 return -ENOMEM;
+ 6 spin_lock(&gp_lock);
+ 7 if (rcu_access_pointer(gp)) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+<b>11 gp = p; /* ORDERING BUG */
+12 p->a = a;
+13 p->b = b;</b>
+14 spin_unlock(&gp_lock);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+If an RCU reader fetches <tt>gp</tt> just after
+<tt>add_gp_buggy_optimized()</tt> executes line 11,
+it will see garbage in the <tt>->a</tt> and <tt>->b</tt>
+fields.
+And this is but one of many ways in which compiler and hardware optimizations
+could cause trouble.
+Therefore, we clearly need some way to prevent the compiler and the CPU from
+reordering in this manner, which brings us to the publish-subscribe
+guarantee discussed in the next section.
+
+<h3><a name="Publish-Subscribe Guarantee">Publish-Subscribe Guarantee</a></h3>
+
+<p>
+RCU's publish-subscribe guarantee allows data to be inserted
+into a linked data structure without disrupting RCU readers.
+The updater uses <tt>rcu_assign_pointer()</tt> to insert the
+new data, and readers use <tt>rcu_dereference()</tt> to
+access data, whether new or old.
+The following shows an example of insertion:
+
+<blockquote>
+<pre>
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4 if (!p)
+ 5 return -ENOMEM;
+ 6 spin_lock(&gp_lock);
+ 7 if (rcu_access_pointer(gp)) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+11 p->a = a;
+12 p->b = b;
+13 rcu_assign_pointer(gp, p);
+14 spin_unlock(&gp_lock);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually
+equivalent to a simple assignment statement, but also guarantees
+that its assignment will
+happen after the two assignments in lines 11 and 12,
+similar to the C11 <tt>memory_order_release</tt> store operation.
+It also prevents any number of “interesting” compiler
+optimizations, for example, the use of <tt>gp</tt> as a scratch
+location immediately preceding the assignment.
+
+<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a>
+But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
+two assignments to <tt>p->a</tt> and <tt>p->b</tt>
+from being reordered.
+Can't that also cause problems?
+<br><a href="#qq3answer">Answer</a>
+
+<p>
+It is tempting to assume that the reader need not do anything special
+to control its accesses to the RCU-protected data,
+as shown in <tt>do_something_gp_buggy()</tt> below:
+
+<blockquote>
+<pre>
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
+ 5 if (p) {
+ 6 do_something(p->a, p->b);
+ 7 rcu_read_unlock();
+ 8 return true;
+ 9 }
+10 rcu_read_unlock();
+11 return false;
+12 }
+</pre>
+</blockquote>
+
+<p>
+However, this temptation must be resisted because there are a
+surprisingly large number of ways that the compiler
+(to say nothing of
+<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
+can trip this code up.
+For but one example, if the compiler were short of registers, it
+might choose to refetch from <tt>gp</tt> rather than keeping
+a separate copy in <tt>p</tt> as follows:
+
+<blockquote>
+<pre>
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
+<b> 5 do_something(gp->a, gp->b);</b>
+ 6 rcu_read_unlock();
+ 7 return true;
+ 8 }
+ 9 rcu_read_unlock();
+10 return false;
+11 }
+</pre>
+</blockquote>
+
+<p>
+If this function ran concurrently with a series of updates that
+replaced the current structure with a new one,
+the fetches of <tt>gp->a</tt>
+and <tt>gp->b</tt> might well come from two different structures,
+which could cause serious confusion.
+To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
+<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
+
+<blockquote>
+<pre>
+ 1 bool do_something_gp(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 p = rcu_dereference(gp);
+ 5 if (p) {
+ 6 do_something(p->a, p->b);
+ 7 rcu_read_unlock();
+ 8 return true;
+ 9 }
+10 rcu_read_unlock();
+11 return false;
+12 }
+</pre>
+</blockquote>
+
+<p>
+The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
+memory barriers in the Linux kernel.
+Should a
+<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
+ever appear, then <tt>rcu_dereference()</tt> could be implemented
+as a <tt>memory_order_consume</tt> load.
+Regardless of the exact implementation, a pointer fetched by
+<tt>rcu_dereference()</tt> may not be used outside of the
+outermost RCU read-side critical section containing that
+<tt>rcu_dereference()</tt>, unless protection of
+the corresponding data element has been passed from RCU to some
+other synchronization mechanism, most commonly locking or
+<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
+
+<p>
+In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
+use <tt>rcu_dereference()</tt>, and these two RCU API elements
+work together to ensure that readers have a consistent view of
+newly added data elements.
+
+<p>
+Of course, it is also necessary to remove elements from RCU-protected
+data structures, for example, using the following process:
+
+<ol>
+<li> Remove the data element from the enclosing structure.
+<li> Wait for all pre-existing RCU read-side critical sections
+ to complete (because only pre-existing readers can possibly have
+ a reference to the newly removed data element).
+<li> At this point, only the updater has a reference to the
+ newly removed data element, so it can safely reclaim
+ the data element, for example, by passing it to <tt>kfree()</tt>.
+</ol>
+
+This process is implemented by <tt>remove_gp_synchronous()</tt>:
+
+<blockquote>
+<pre>
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3 struct foo *p;
+ 4
+ 5 spin_lock(&gp_lock);
+ 6 p = rcu_access_pointer(gp);
+ 7 if (!p) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+11 rcu_assign_pointer(gp, NULL);
+12 spin_unlock(&gp_lock);
+13 synchronize_rcu();
+14 kfree(p);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+This function is straightforward, with line 13 waiting for a grace
+period before line 14 frees the old data element.
+This waiting ensures that readers will reach line 7 of
+<tt>do_something_gp()</tt> before the data element referenced by
+<tt>p</tt> is freed.
+The <tt>rcu_access_pointer()</tt> on line 6 is similar to
+<tt>rcu_dereference()</tt>, except that:
+
+<ol>
+<li> The value returned by <tt>rcu_access_pointer()</tt>
+ cannot be dereferenced.
+ If you want to access the value pointed to as well as
+ the pointer itself, use <tt>rcu_dereference()</tt>
+ instead of <tt>rcu_access_pointer()</tt>.
+<li> The call to <tt>rcu_access_pointer()</tt> need not be
+ protected.
+ In contrast, <tt>rcu_dereference()</tt> must either be
+ within an RCU read-side critical section or in a code
+ segment where the pointer cannot change, for example, in
+ code protected by the corresponding update-side lock.
+</ol>
+
+<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a>
+Without the <tt>rcu_dereference()</tt> or the
+<tt>rcu_access_pointer()</tt>, what destructive optimizations
+might the compiler make use of?
+<br><a href="#qq4answer">Answer</a>
+
+<p>
+In short, RCU's publish-subscribe guarantee is provided by the combination
+of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
+This guarantee allows data elements to be safely added to RCU-protected
+linked data structures without disrupting RCU readers.
+This guarantee can be used in combination with the grace-period
+guarantee to also allow data elements to be removed from RCU-protected
+linked data structures, again without disrupting RCU readers.
+
+<p>
+This guarantee was only partially premeditated.
+DYNIX/ptx used an explicit memory barrier for publication, but had nothing
+resembling <tt>rcu_dereference()</tt> for subscription, nor did it
+have anything resembling the <tt>smp_read_barrier_depends()</tt>
+that was later subsumed into <tt>rcu_dereference()</tt>.
+The need for these operations made itself known quite suddenly at a
+late-1990s meeting with the DEC Alpha architects, back in the days when
+DEC was still a free-standing company.
+It took the Alpha architects a good hour to convince me that any sort
+of barrier would ever be needed, and it then took me a good <i>two</i> hours
+to convince them that their documentation did not make this point clear.
+More recent work with the C and C++ standards committees has provided
+much education on tricks and traps from the compiler.
+In short, compilers were much less tricky in the early 1990s, but in
+2015, don't even think about omitting <tt>rcu_dereference()</tt>!
+
+<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
+
+<p>
+The previous section's simple linked-data-structure scenario clearly
+demonstrates the need for RCU's stringent memory-ordering guarantees on
+systems with more than one CPU:
+
+<ol>
+<li> Each CPU that has an RCU read-side critical section that
+ begins before <tt>synchronize_rcu()</tt> starts is
+ guaranteed to execute a full memory barrier between the time
+ that the RCU read-side critical section ends and the time that
+ <tt>synchronize_rcu()</tt> returns.
+ Without this guarantee, a pre-existing RCU read-side critical section
+ might hold a reference to the newly removed <tt>struct foo</tt>
+ after the <tt>kfree()</tt> on line 14 of
+ <tt>remove_gp_synchronous()</tt>.
+<li> Each CPU that has an RCU read-side critical section that ends
+ after <tt>synchronize_rcu()</tt> returns is guaranteed
+ to execute a full memory barrier between the time that
+ <tt>synchronize_rcu()</tt> begins and the time that the RCU
+ read-side critical section begins.
+ Without this guarantee, a later RCU read-side critical section
+ running after the <tt>kfree()</tt> on line 14 of
+ <tt>remove_gp_synchronous()</tt> might
+ later run <tt>do_something_gp()</tt> and find the
+ newly deleted <tt>struct foo</tt>.
+<li> If the task invoking <tt>synchronize_rcu()</tt> remains
+ on a given CPU, then that CPU is guaranteed to execute a full
+ memory barrier sometime during the execution of
+ <tt>synchronize_rcu()</tt>.
+ This guarantee ensures that the <tt>kfree()</tt> on
+ line 14 of <tt>remove_gp_synchronous()</tt> really does
+ execute after the removal on line 11.
+<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
+ among a group of CPUs during that invocation, then each of the
+ CPUs in that group is guaranteed to execute a full memory barrier
+ sometime during the execution of <tt>synchronize_rcu()</tt>.
+ This guarantee also ensures that the <tt>kfree()</tt> on
+ line 14 of <tt>remove_gp_synchronous()</tt> really does
+ execute after the removal on
+ line 11, but also in the case where the thread executing the
+ <tt>synchronize_rcu()</tt> migrates in the meantime.
+</ol>
+
+<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a>
+Given that multiple CPUs can start RCU read-side critical sections
+at any time without any ordering whatsoever, how can RCU possibly tell whether
+or not a given RCU read-side critical section starts before a
+given instance of <tt>synchronize_rcu()</tt>?
+<br><a href="#qq5answer">Answer</a>
+
+<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a>
+The first and second guarantees require unbelievably strict ordering!
+Are all these memory barriers <i> really</i> required?
+<br><a href="#qq6answer">Answer</a>
+
+<p>
+Note that these memory-barrier requirements do not replace the fundamental
+RCU requirement that a grace period wait for all pre-existing readers.
+On the contrary, the memory barriers called out in this section must operate in
+such a way as to <i>enforce</i> this fundamental requirement.
+Of course, different implementations enforce this requirement in different
+ways, but enforce it they must.
+
+<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
+
+<p>
+The common-case RCU primitives are unconditional.
+They are invoked, they do their job, and they return, with no possibility
+of error, and no need to retry.
+This is a key RCU design philosophy.
+
+<p>
+However, this philosophy is pragmatic rather than pigheaded.
+If someone comes up with a good justification for a particular conditional
+RCU primitive, it might well be implemented and added.
+After all, this guarantee was reverse-engineered, not premeditated.
+The unconditional nature of the RCU primitives was initially an
+accident of implementation, and later experience with synchronization
+primitives with conditional primitives caused me to elevate this
+accident to a guarantee.
+Therefore, the justification for adding a conditional primitive to
+RCU would need to be based on detailed and compelling use cases.
+
+<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
+
+<p>
+As far as RCU is concerned, it is always possible to carry out an
+update within an RCU read-side critical section.
+For example, that RCU read-side critical section might search for
+a given data element, and then might acquire the update-side
+spinlock in order to update that element, all while remaining
+in that RCU read-side critical section.
+Of course, it is necessary to exit the RCU read-side critical section
+before invoking <tt>synchronize_rcu()</tt>, however, this
+inconvenience can be avoided through use of the
+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
+described later in this document.
+
+<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a>
+But how does the upgrade-to-write operation exclude other readers?
+<br><a href="#qq7answer">Answer</a>
+
+<p>
+This guarantee allows lookup code to be shared between read-side
+and update-side code, and was premeditated, appearing in the earliest
+DYNIX/ptx RCU documentation.
+
+<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
+
+<p>
+RCU provides extremely lightweight readers, and its read-side guarantees,
+though quite useful, are correspondingly lightweight.
+It is therefore all too easy to assume that RCU is guaranteeing more
+than it really is.
+Of course, the list of things that RCU does not guarantee is infinitely
+long, however, the following sections list a few non-guarantees that
+have caused confusion.
+Except where otherwise noted, these non-guarantees were premeditated.
+
+<ol>
+<li> <a href="#Readers Impose Minimal Ordering">
+ Readers Impose Minimal Ordering</a>
+<li> <a href="#Readers Do Not Exclude Updaters">
+ Readers Do Not Exclude Updaters</a>
+<li> <a href="#Updaters Only Wait For Old Readers">
+ Updaters Only Wait For Old Readers</a>
+<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
+ Grace Periods Don't Partition Read-Side Critical Sections</a>
+<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
+ Read-Side Critical Sections Don't Partition Grace Periods</a>
+<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
+ Disabling Preemption Does Not Block Grace Periods</a>
+</ol>
+
+<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
+
+<p>
+Reader-side markers such as <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
+except through their interaction with the grace-period APIs such as
+<tt>synchronize_rcu()</tt>.
+To see this, consider the following pair of threads:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(x, 1);
+ 5 rcu_read_unlock();
+ 6 rcu_read_lock();
+ 7 WRITE_ONCE(y, 1);
+ 8 rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13 rcu_read_lock();
+14 r1 = READ_ONCE(y);
+15 rcu_read_unlock();
+16 rcu_read_lock();
+17 r2 = READ_ONCE(x);
+18 rcu_read_unlock();
+19 }
+</pre>
+</blockquote>
+
+<p>
+After <tt>thread0()</tt> and <tt>thread1()</tt> execute
+concurrently, it is quite possible to have
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 0)
+</pre>
+</blockquote>
+
+(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
+which would not be possible if <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> had much in the way of ordering
+properties.
+But they do not, so the CPU is within its rights
+to do significant reordering.
+This is by design: Any significant ordering constraints would slow down
+these fast-path APIs.
+
+<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a>
+Can't the compiler also reorder this code?
+<br><a href="#qq8answer">Answer</a>
+
+<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
+
+<p>
+Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
+exclude updates.
+All they do is to prevent grace periods from ending.
+The following example illustrates this:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 r1 = READ_ONCE(y);
+ 5 if (r1) {
+ 6 do_something_with_nonzero_x();
+ 7 r2 = READ_ONCE(x);
+ 8 WARN_ON(!r2); /* BUG!!! */
+ 9 }
+10 rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15 spin_lock(&my_lock);
+16 WRITE_ONCE(x, 1);
+17 WRITE_ONCE(y, 1);
+18 spin_unlock(&my_lock);
+19 }
+</pre>
+</blockquote>
+
+<p>
+If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
+excluded the <tt>thread1()</tt> function's update,
+the <tt>WARN_ON()</tt> could never fire.
+But the fact is that <tt>rcu_read_lock()</tt> does not exclude
+much of anything aside from subsequent grace periods, of which
+<tt>thread1()</tt> has none, so the
+<tt>WARN_ON()</tt> can and does fire.
+
+<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
+
+<p>
+It might be tempting to assume that after <tt>synchronize_rcu()</tt>
+completes, there are no readers executing.
+This temptation must be avoided because
+new readers can start immediately after <tt>synchronize_rcu()</tt>
+starts, and <tt>synchronize_rcu()</tt> is under no
+obligation to wait for these new readers.
+
+<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a>
+Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
+Would the updater be able to rely on this?
+<br><a href="#qq9answer">Answer</a>
+
+<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
+Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
+
+<p>
+It is tempting to assume that if any part of one RCU read-side critical
+section precedes a given grace period, and if any part of another RCU
+read-side critical section follows that same grace period, then all of
+the first RCU read-side critical section must precede all of the second.
+However, this just isn't the case: A single grace period does not
+partition the set of RCU read-side critical sections.
+An example of this situation can be illustrated as follows, where
+<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(a, 1);
+ 5 WRITE_ONCE(b, 1);
+ 6 rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11 r1 = READ_ONCE(a);
+12 synchronize_rcu();
+13 WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18 rcu_read_lock();
+19 r2 = READ_ONCE(b);
+20 r3 = READ_ONCE(c);
+21 rcu_read_unlock();
+22 }
+</pre>
+</blockquote>
+
+<p>
+It turns out that the outcome:
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 0 && r3 == 1)
+</pre>
+</blockquote>
+
+is entirely possible.
+The following figure shows how this can happen, with each circled
+<tt>QS</tt> indicating the point at which RCU recorded a
+<i>quiescent state</i> for each thread, that is, a state in which
+RCU knows that the thread cannot be in the midst of an RCU read-side
+critical section that started before the current grace period:
+
+<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
+
+<p>
+If it is necessary to partition RCU read-side critical sections in this
+manner, it is necessary to use two grace periods, where the first
+grace period is known to end before the second grace period starts:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(a, 1);
+ 5 WRITE_ONCE(b, 1);
+ 6 rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11 r1 = READ_ONCE(a);
+12 synchronize_rcu();
+13 WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18 r2 = READ_ONCE(c);
+19 synchronize_rcu();
+20 WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25 rcu_read_lock();
+26 r3 = READ_ONCE(b);
+27 r4 = READ_ONCE(d);
+28 rcu_read_unlock();
+29 }
+</pre>
+</blockquote>
+
+<p>
+Here, if <tt>(r1 == 1)</tt>, then
+<tt>thread0()</tt>'s write to <tt>b</tt> must happen
+before the end of <tt>thread1()</tt>'s grace period.
+If in addition <tt>(r4 == 1)</tt>, then
+<tt>thread3()</tt>'s read from <tt>b</tt> must happen
+after the beginning of <tt>thread2()</tt>'s grace period.
+If it is also the case that <tt>(r2 == 1)</tt>, then the
+end of <tt>thread1()</tt>'s grace period must precede the
+beginning of <tt>thread2()</tt>'s grace period.
+This means that the two RCU read-side critical sections cannot overlap,
+guaranteeing that <tt>(r3 == 1)</tt>.
+As a result, the outcome:
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+</pre>
+</blockquote>
+
+cannot happen.
+
+<p>
+This non-requirement was also non-premeditated, but became apparent
+when studying RCU's interaction with memory ordering.
+
+<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
+Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
+
+<p>
+It is also tempting to assume that if an RCU read-side critical section
+happens between a pair of grace periods, then those grace periods cannot
+overlap.
+However, this temptation leads nowhere good, as can be illustrated by
+the following, with all variables initially zero:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(a, 1);
+ 5 WRITE_ONCE(b, 1);
+ 6 rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11 r1 = READ_ONCE(a);
+12 synchronize_rcu();
+13 WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18 rcu_read_lock();
+19 WRITE_ONCE(d, 1);
+20 r2 = READ_ONCE(c);
+21 rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26 r3 = READ_ONCE(d);
+27 synchronize_rcu();
+28 WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33 rcu_read_lock();
+34 r4 = READ_ONCE(b);
+35 r5 = READ_ONCE(e);
+36 rcu_read_unlock();
+37 }
+</pre>
+</blockquote>
+
+<p>
+In this case, the outcome:
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+</pre>
+</blockquote>
+
+is entirely possible, as illustrated below:
+
+<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
+
+<p>
+Again, an RCU read-side critical section can overlap almost all of a
+given grace period, just so long as it does not overlap the entire
+grace period.
+As a result, an RCU read-side critical section cannot partition a pair
+of RCU grace periods.
+
+<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a>
+How long a sequence of grace periods, each separated by an RCU read-side
+critical section, would be required to partition the RCU read-side
+critical sections at the beginning and end of the chain?
+<br><a href="#qq10answer">Answer</a>
+
+<h3><a name="Disabling Preemption Does Not Block Grace Periods">
+Disabling Preemption Does Not Block Grace Periods</a></h3>
+
+<p>
+There was a time when disabling preemption on any given CPU would block
+subsequent grace periods.
+However, this was an accident of implementation and is not a requirement.
+And in the current Linux-kernel implementation, disabling preemption
+on a given CPU in fact does not block grace periods, as Oleg Nesterov
+<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
+
+<p>
+If you need a preempt-disable region to block grace periods, you need to add
+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
+as follows:
+
+<blockquote>
+<pre>
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+</pre>
+</blockquote>
+
+<p>
+In theory, you could enter the RCU read-side critical section first,
+but it is more efficient to keep the entire RCU read-side critical
+section contained in the preempt-disable region as shown above.
+Of course, RCU read-side critical sections that extend outside of
+preempt-disable regions will work correctly, but such critical sections
+can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
+more work.
+And no, this is <i>not</i> an invitation to enclose all of your RCU
+read-side critical sections within preempt-disable regions, because
+doing so would degrade real-time response.
+
+<p>
+This non-requirement appeared with preemptible RCU.
+If you need a grace period that waits on non-preemptible code regions, use
+<a href="#Sched Flavor">RCU-sched</a>.
+
+<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
+
+<p>
+These parallelism facts of life are by no means specific to RCU, but
+the RCU implementation must abide by them.
+They therefore bear repeating:
+
+<ol>
+<li> Any CPU or task may be delayed at any time,
+ and any attempts to avoid these delays by disabling
+ preemption, interrupts, or whatever are completely futile.
+ This is most obvious in preemptible user-level
+ environments and in virtualized environments (where
+ a given guest OS's VCPUs can be preempted at any time by
+ the underlying hypervisor), but can also happen in bare-metal
+ environments due to ECC errors, NMIs, and other hardware
+ events.
+ Although a delay of more than about 20 seconds can result
+ in splats, the RCU implementation is obligated to use
+ algorithms that can tolerate extremely long delays, but where
+ “extremely long” is not long enough to allow
+ wrap-around when incrementing a 64-bit counter.
+<li> Both the compiler and the CPU can reorder memory accesses.
+ Where it matters, RCU must use compiler directives and
+ memory-barrier instructions to preserve ordering.
+<li> Conflicting writes to memory locations in any given cache line
+ will result in expensive cache misses.
+ Greater numbers of concurrent writes and more-frequent
+ concurrent writes will result in more dramatic slowdowns.
+ RCU is therefore obligated to use algorithms that have
+ sufficient locality to avoid significant performance and
+ scalability problems.
+<li> As a rough rule of thumb, only one CPU's worth of processing
+ may be carried out under the protection of any given exclusive
+ lock.
+ RCU must therefore use scalable locking designs.
+<li> Counters are finite, especially on 32-bit systems.
+ RCU's use of counters must therefore tolerate counter wrap,
+ or be designed such that counter wrap would take way more
+ time than a single system is likely to run.
+ An uptime of ten years is quite possible, a runtime
+ of a century much less so.
+ As an example of the latter, RCU's dyntick-idle nesting counter
+ allows 54 bits for interrupt nesting level (this counter
+ is 64 bits even on a 32-bit system).
+ Overflowing this counter requires 2<sup>54</sup>
+ half-interrupts on a given CPU without that CPU ever going idle.
+ If a half-interrupt happened every microsecond, it would take
+ 570 years of runtime to overflow this counter, which is currently
+ believed to be an acceptably long time.
+<li> Linux systems can have thousands of CPUs running a single
+ Linux kernel in a single shared-memory environment.
+ RCU must therefore pay close attention to high-end scalability.
+</ol>
+
+<p>
+This last parallelism fact of life means that RCU must pay special
+attention to the preceding facts of life.
+The idea that Linux might scale to systems with thousands of CPUs would
+have been met with some skepticism in the 1990s, but these requirements
+would otherwise have been unsurprising, even in the early 1990s.
+
+<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
+
+<p>
+These sections list quality-of-implementation requirements.
+Although an RCU implementation that ignores these requirements could
+still be used, it would likely be subject to limitations that would
+make it inappropriate for industrial-strength production use.
+Classes of quality-of-implementation requirements are as follows:
+
+<ol>
+<li> <a href="#Specialization">Specialization</a>
+<li> <a href="#Performance and Scalability">Performance and Scalability</a>
+<li> <a href="#Composability">Composability</a>
+<li> <a href="#Corner Cases">Corner Cases</a>
+</ol>
+
+<p>
+These classes are covered in the following sections.
+
+<h3><a name="Specialization">Specialization</a></h3>
+
+<p>
+RCU is and always has been intended primarily for read-mostly situations, as
+illustrated by the following figure.
+This means that RCU's read-side primitives are optimized, often at the
+expense of its update-side primitives.
+
+<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
+
+<p>
+This focus on read-mostly situations means that RCU must interoperate
+with other synchronization primitives.
+For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
+examples discussed earlier use RCU to protect readers and locking to
+coordinate updaters.
+However, the need extends much farther, requiring that a variety of
+synchronization primitives be legal within RCU read-side critical sections,
+including spinlocks, sequence locks, atomic operations, reference
+counters, and memory barriers.
+
+<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a>
+What about sleeping locks?
+<br><a href="#qq11answer">Answer</a>
+
+<p>
+It often comes as a surprise that many algorithms do not require a
+consistent view of data, but many can function in that mode,
+with network routing being the poster child.
+Internet routing algorithms take significant time to propagate
+updates, so that by the time an update arrives at a given system,
+that system has been sending network traffic the wrong way for
+a considerable length of time.
+Having a few threads continue to send traffic the wrong way for a
+few more milliseconds is clearly not a problem: In the worst case,
+TCP retransmissions will eventually get the data where it needs to go.
+In general, when tracking the state of the universe outside of the
+computer, some level of inconsistency must be tolerated due to
+speed-of-light delays if nothing else.
+
+<p>
+Furthermore, uncertainty about external state is inherent in many cases.
+For example, a pair of veterinarians might use heartbeat to determine
+whether or not a given cat was alive.
+But how long should they wait after the last heartbeat to decide that
+the cat is in fact dead?
+Waiting less than 400 milliseconds makes no sense because this would
+mean that a relaxed cat would be considered to cycle between death
+and life more than 100 times per minute.
+Moreover, just as with human beings, a cat's heart might stop for
+some period of time, so the exact wait period is a judgment call.
+One of our pair of veterinarians might wait 30 seconds before pronouncing
+the cat dead, while the other might insist on waiting a full minute.
+The two veterinarians would then disagree on the state of the cat during
+the final 30 seconds of the minute following the last heartbeat, as
+fancifully illustrated below:
+
+<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
+
+<p>
+Interestingly enough, this same situation applies to hardware.
+When push comes to shove, how do we tell whether or not some
+external server has failed?
+We send messages to it periodically, and declare it failed if we
+don't receive a response within a given period of time.
+Policy decisions can usually tolerate short
+periods of inconsistency.
+The policy was decided some time ago, and is only now being put into
+effect, so a few milliseconds of delay is normally inconsequential.
+
+<p>
+However, there are algorithms that absolutely must see consistent data.
+For example, the translation from a user-level SystemV semaphore
+ID to the corresponding in-kernel data structure is protected by RCU,
+but it is absolutely forbidden to update a semaphore that has just been
+removed.
+In the Linux kernel, this need for consistency is accommodated by acquiring
+spinlocks located in the in-kernel data structure from within
+the RCU read-side critical section, and this is indicated by the
+green box in the figure above.
+Many other techniques may be used, and are in fact used within the
+Linux kernel.
+
+<p>
+In short, RCU is not required to maintain consistency, and other
+mechanisms may be used in concert with RCU when consistency is required.
+RCU's specialization allows it to do its job extremely well, and its
+ability to interoperate with other synchronization mechanisms allows
+the right mix of synchronization tools to be used for a given job.
+
+<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
+
+<p>
+Energy efficiency is a critical component of performance today,
+and Linux-kernel RCU implementations must therefore avoid unnecessarily
+awakening idle CPUs.
+I cannot claim that this requirement was premeditated.
+In fact, I learned of it during a telephone conversation in which I
+was given “frank and open” feedback on the importance
+of energy efficiency in battery-powered systems and on specific
+energy-efficiency shortcomings of the Linux-kernel RCU implementation.
+In my experience, the battery-powered embedded community will consider
+any unnecessary wakeups to be extremely unfriendly acts.
+So much so that mere Linux-kernel-mailing-list posts are
+insufficient to vent their ire.
+
+<p>
+Memory consumption is not particularly important in most
+situations, and has become decreasingly
+so as memory sizes have expanded and memory
+costs have plummeted.
+However, as I learned from Matt Mackall's
+<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
+efforts, memory footprint is critically important on single-CPU systems with
+non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
+<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
+was born.
+Josh Triplett has since taken over the small-memory banner with his
+<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
+project, which resulted in
+<a href="#Sleepable RCU">SRCU</a>
+becoming optional for those kernels not needing it.
+
+<p>
+The remaining performance requirements are, for the most part,
+unsurprising.
+For example, in keeping with RCU's read-side specialization,
+<tt>rcu_dereference()</tt> should have negligible overhead (for
+example, suppression of a few minor compiler optimizations).
+Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
+
+<p>
+In preemptible environments, in the case where the RCU read-side
+critical section was not preempted (as will be the case for the
+highest-priority real-time process), <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> should have minimal overhead.
+In particular, they should not contain atomic read-modify-write
+operations, memory-barrier instructions, preemption disabling,
+interrupt disabling, or backwards branches.
+However, in the case where the RCU read-side critical section was preempted,
+<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
+This is why it is better to nest an RCU read-side critical section
+within a preempt-disable region than vice versa, at least in cases
+where that critical section is short enough to avoid unduly degrading
+real-time latencies.
+
+<p>
+The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
+optimized for throughput.
+It may therefore incur several milliseconds of latency in addition to
+the duration of the longest RCU read-side critical section.
+On the other hand, multiple concurrent invocations of
+<tt>synchronize_rcu()</tt> are required to use batching optimizations
+so that they can be satisfied by a single underlying grace-period-wait
+operation.
+For example, in the Linux kernel, it is not unusual for a single
+grace-period-wait operation to serve more than
+<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
+of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
+overhead down to nearly zero.
+However, the grace-period optimization is also required to avoid
+measurable degradation of real-time scheduling and interrupt latencies.
+
+<p>
+In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
+latencies are unacceptable.
+In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
+instead, reducing the grace-period latency down to a few tens of
+microseconds on small systems, at least in cases where the RCU read-side
+critical sections are short.
+There are currently no special latency requirements for
+<tt>synchronize_rcu_expedited()</tt> on large systems, but,
+consistent with the empirical nature of the RCU specification,
+that is subject to change.
+However, there most definitely are scalability requirements:
+A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
+CPUs should at least make reasonable forward progress.
+In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
+is permitted to impose modest degradation of real-time latency
+on non-idle online CPUs.
+That said, it will likely be necessary to take further steps to reduce this
+degradation, hopefully to roughly that of a scheduling-clock interrupt.
+
+<p>
+There are a number of situations where even
+<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
+latency is unacceptable.
+In these situations, the asynchronous <tt>call_rcu()</tt> can be
+used in place of <tt>synchronize_rcu()</tt> as follows:
+
+<blockquote>
+<pre>
+ 1 struct foo {
+ 2 int a;
+ 3 int b;
+ 4 struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9 struct foo *p = container_of(rhp, struct foo, rh);
+10
+11 kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16 struct foo *p;
+17
+18 spin_lock(&gp_lock);
+19 p = rcu_access_pointer(gp);
+20 if (!p) {
+21 spin_unlock(&gp_lock);
+22 return false;
+23 }
+24 rcu_assign_pointer(gp, NULL);
+25 call_rcu(&p->rh, remove_gp_cb);
+26 spin_unlock(&gp_lock);
+27 return true;
+28 }
+</pre>
+</blockquote>
+
+<p>
+A definition of <tt>struct foo</tt> is finally needed, and appears
+on lines 1-5.
+The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
+on line 25, and will be invoked after the end of a subsequent
+grace period.
+This gets the same effect as <tt>remove_gp_synchronous()</tt>,
+but without forcing the updater to wait for a grace period to elapse.
+The <tt>call_rcu()</tt> function may be used in a number of
+situations where neither <tt>synchronize_rcu()</tt> nor
+<tt>synchronize_rcu_expedited()</tt> would be legal,
+including within preempt-disable code, <tt>local_bh_disable()</tt> code,
+interrupt-disable code, and interrupt handlers.
+However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
+The callback function (<tt>remove_gp_cb()</tt> in this case) will be
+executed within a softirq (software interrupt) environment within the
+Linux kernel,
+either within a real softirq handler or under the protection
+of <tt>local_bh_disable()</tt>.
+In both the Linux kernel and in userspace, it is bad practice to
+write an RCU callback function that takes too long.
+Long-running operations should be relegated to separate threads or
+(in the Linux kernel) workqueues.
+
+<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a>
+Why does line 19 use <tt>rcu_access_pointer()</tt>?
+After all, <tt>call_rcu()</tt> on line 25 stores into the
+structure, which would interact badly with concurrent insertions.
+Doesn't this mean that <tt>rcu_dereference()</tt> is required?
+<br><a href="#qq12answer">Answer</a>
+
+<p>
+However, all that <tt>remove_gp_cb()</tt> is doing is
+invoking <tt>kfree()</tt> on the data element.
+This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
+which allows “fire and forget” operation as shown below:
+
+<blockquote>
+<pre>
+ 1 struct foo {
+ 2 int a;
+ 3 int b;
+ 4 struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9 struct foo *p;
+10
+11 spin_lock(&gp_lock);
+12 p = rcu_dereference(gp);
+13 if (!p) {
+14 spin_unlock(&gp_lock);
+15 return false;
+16 }
+17 rcu_assign_pointer(gp, NULL);
+18 kfree_rcu(p, rh);
+19 spin_unlock(&gp_lock);
+20 return true;
+21 }
+</pre>
+</blockquote>
+
+<p>
+Note that <tt>remove_gp_faf()</tt> simply invokes
+<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
+further attention to the subsequent grace period and <tt>kfree()</tt>.
+It is permissible to invoke <tt>kfree_rcu()</tt> from the same
+environments as for <tt>call_rcu()</tt>.
+Interestingly enough, DYNIX/ptx had the equivalents of
+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
+<tt>synchronize_rcu()</tt>.
+This was due to the fact that RCU was not heavily used within DYNIX/ptx,
+so the very few places that needed something like
+<tt>synchronize_rcu()</tt> simply open-coded it.
+
+<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a>
+Earlier it was claimed that <tt>call_rcu()</tt> and
+<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
+by readers.
+But how can that be correct, given that the invocation of the callback
+and the freeing of the memory (respectively) must still wait for
+a grace period to elapse?
+<br><a href="#qq13answer">Answer</a>
+
+<p>
+But what if the updater must wait for the completion of code to be
+executed after the end of the grace period, but has other tasks
+that can be carried out in the meantime?
+The polling-style <tt>get_state_synchronize_rcu()</tt> and
+<tt>cond_synchronize_rcu()</tt> functions may be used for this
+purpose, as shown below:
+
+<blockquote>
+<pre>
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3 struct foo *p;
+ 4 unsigned long s;
+ 5
+ 6 spin_lock(&gp_lock);
+ 7 p = rcu_access_pointer(gp);
+ 8 if (!p) {
+ 9 spin_unlock(&gp_lock);
+10 return false;
+11 }
+12 rcu_assign_pointer(gp, NULL);
+13 spin_unlock(&gp_lock);
+14 s = get_state_synchronize_rcu();
+15 do_something_while_waiting();
+16 cond_synchronize_rcu(s);
+17 kfree(p);
+18 return true;
+19 }
+</pre>
+</blockquote>
+
+<p>
+On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a
+“cookie” from RCU,
+then line 15 carries out other tasks,
+and finally, line 16 returns immediately if a grace period has
+elapsed in the meantime, but otherwise waits as required.
+The need for <tt>get_state_synchronize_rcu()</tt> and
+<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
+so it is too early to tell whether they will stand the test of time.
+
+<p>
+RCU thus provides a range of tools to allow updaters to strike the
+required tradeoff between latency, flexibility and CPU overhead.
+
+<h3><a name="Composability">Composability</a></h3>
+
+<p>
+Composability has received much attention in recent years, perhaps in part
+due to the collision of multicore hardware with object-oriented techniques
+designed in single-threaded environments for single-threaded use.
+And in theory, RCU read-side critical sections may be composed, and in
+fact may be nested arbitrarily deeply.
+In practice, as with all real-world implementations of composable
+constructs, there are limitations.
+
+<p>
+Implementations of RCU for which <tt>rcu_read_lock()</tt>
+and <tt>rcu_read_unlock()</tt> generate no code, such as
+Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
+nested arbitrarily deeply.
+After all, there is no overhead.
+Except that if all these instances of <tt>rcu_read_lock()</tt>
+and <tt>rcu_read_unlock()</tt> are visible to the compiler,
+compilation will eventually fail due to exhausting memory,
+mass storage, or user patience, whichever comes first.
+If the nesting is not visible to the compiler, as is the case with
+mutually recursive functions each in its own translation unit,
+stack overflow will result.
+If the nesting takes the form of loops, either the control variable
+will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
+Nevertheless, this class of RCU implementations is one
+of the most composable constructs in existence.
+
+<p>
+RCU implementations that explicitly track nesting depth
+are limited by the nesting-depth counter.
+For example, the Linux kernel's preemptible RCU limits nesting to
+<tt>INT_MAX</tt>.
+This should suffice for almost all practical purposes.
+That said, a consecutive pair of RCU read-side critical sections
+between which there is an operation that waits for a grace period
+cannot be enclosed in another RCU read-side critical section.
+This is because it is not legal to wait for a grace period within
+an RCU read-side critical section: To do so would result either
+in deadlock or
+in RCU implicitly splitting the enclosing RCU read-side critical
+section, neither of which is conducive to a long-lived and prosperous
+kernel.
+
+<p>
+It is worth noting that RCU is not alone in limiting composability.
+For example, many transactional-memory implementations prohibit
+composing a pair of transactions separated by an irrevocable
+operation (for example, a network receive operation).
+For another example, lock-based critical sections can be composed
+surprisingly freely, but only if deadlock is avoided.
+
+<p>
+In short, although RCU read-side critical sections are highly composable,
+care is required in some situations, just as is the case for any other
+composable synchronization mechanism.
+
+<h3><a name="Corner Cases">Corner Cases</a></h3>
+
+<p>
+A given RCU workload might have an endless and intense stream of
+RCU read-side critical sections, perhaps even so intense that there
+was never a point in time during which there was not at least one
+RCU read-side critical section in flight.
+RCU cannot allow this situation to block grace periods: As long as
+all the RCU read-side critical sections are finite, grace periods
+must also be finite.
+
+<p>
+That said, preemptible RCU implementations could potentially result
+in RCU read-side critical sections being preempted for long durations,
+which has the effect of creating a long-duration RCU read-side
+critical section.
+This situation can arise only in heavily loaded systems, but systems using
+real-time priorities are of course more vulnerable.
+Therefore, RCU priority boosting is provided to help deal with this
+case.
+That said, the exact requirements on RCU priority boosting will likely
+evolve as more experience accumulates.
+
+<p>
+Other workloads might have very high update rates.
+Although one can argue that such workloads should instead use
+something other than RCU, the fact remains that RCU must
+handle such workloads gracefully.
+This requirement is another factor driving batching of grace periods,
+but it is also the driving force behind the checks for large numbers
+of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
+Finally, high update rates should not delay RCU read-side critical
+sections, although some read-side delays can occur when using
+<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
+of <tt>try_stop_cpus()</tt>.
+(In the future, <tt>synchronize_rcu_expedited()</tt> will be
+converted to use lighter-weight inter-processor interrupts (IPIs),
+but this will still disturb readers, though to a much smaller degree.)
+
+<p>
+Although all three of these corner cases were understood in the early
+1990s, a simple user-level test consisting of <tt>close(open(path))</tt>
+in a tight loop
+in the early 2000s suddenly provided a much deeper appreciation of the
+high-update-rate corner case.
+This test also motivated addition of some RCU code to react to high update
+rates, for example, if a given CPU finds itself with more than 10,000
+RCU callbacks queued, it will cause RCU to take evasive action by
+more aggressively starting grace periods and more aggressively forcing
+completion of grace-period processing.
+This evasive action causes the grace period to complete more quickly,
+but at the cost of restricting RCU's batching optimizations, thus
+increasing the CPU overhead incurred by that grace period.
+
+<h2><a name="Software-Engineering Requirements">
+Software-Engineering Requirements</a></h2>
+
+<p>
+Between Murphy's Law and “To err is human”, it is necessary to
+guard against mishaps and misuse:
+
+<ol>
+<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
+ everywhere that it is needed, so kernels built with
+ <tt>CONFIG_PROVE_RCU=y</tt> will splat if
+ <tt>rcu_dereference()</tt> is used outside of an
+ RCU read-side critical section.
+ Update-side code can use <tt>rcu_dereference_protected()</tt>,
+ which takes a
+ <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
+ to indicate what is providing the protection.
+ If the indicated protection is not provided, a lockdep splat
+ is emitted.
+
+ <p>
+ Code shared between readers and updaters can use
+ <tt>rcu_dereference_check()</tt>, which also takes a
+ lockdep expression, and emits a lockdep splat if neither
+ <tt>rcu_read_lock()</tt> nor the indicated protection
+ is in place.
+ In addition, <tt>rcu_dereference_raw()</tt> is used in those
+ (hopefully rare) cases where the required protection cannot
+ be easily described.
+ Finally, <tt>rcu_read_lock_held()</tt> is provided to
+ allow a function to verify that it has been invoked within
+ an RCU read-side critical section.
+ I was made aware of this set of requirements shortly after Thomas
+ Gleixner audited a number of RCU uses.
+<li> A given function might wish to check for RCU-related preconditions
+ upon entry, before using any other RCU API.
+ The <tt>rcu_lockdep_assert()</tt> does this job,
+ asserting the expression in kernels having lockdep enabled
+ and doing nothing otherwise.
+<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
+ and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
+ substituting a simple assignment.
+ To catch this sort of error, a given RCU-protected pointer may be
+ tagged with <tt>__rcu</tt>, after which running sparse
+ with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
+ about simple-assignment accesses to that pointer.
+ Arnd Bergmann made me aware of this requirement, and also
+ supplied the needed
+ <a href="https://lwn.net/Articles/376011/">patch series</a>.
+<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
+ will splat if a data element is passed to <tt>call_rcu()</tt>
+ twice in a row, without a grace period in between.
+ (This error is similar to a double free.)
+ The corresponding <tt>rcu_head</tt> structures that are
+ dynamically allocated are automatically tracked, but
+ <tt>rcu_head</tt> structures allocated on the stack
+ must be initialized with <tt>init_rcu_head_on_stack()</tt>
+ and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
+ Similarly, statically allocated non-stack <tt>rcu_head</tt>
+ structures must be initialized with <tt>init_rcu_head()</tt>
+ and cleaned up with <tt>destroy_rcu_head()</tt>.
+ Mathieu Desnoyers made me aware of this requirement, and also
+ supplied the needed
+ <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
+<li> An infinite loop in an RCU read-side critical section will
+ eventually trigger an RCU CPU stall warning splat, with
+ the duration of “eventually” being controlled by the
+ <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
+ alternatively, by the
+ <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
+ parameter.
+ However, RCU is not obligated to produce this splat
+ unless there is a grace period waiting on that particular
+ RCU read-side critical section.
+ <p>
+ Some extreme workloads might intentionally delay
+ RCU grace periods, and systems running those workloads can
+ be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
+ to suppress the splats.
+ This kernel parameter may also be set via <tt>sysfs</tt>.
+ Furthermore, RCU CPU stall warnings are counter-productive
+ during sysrq dumps and during panics.
+ RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
+ <tt>rcu_sysrq_end()</tt> API members to be called before
+ and after long sysrq dumps.
+ RCU also supplies the <tt>rcu_panic()</tt> notifier that is
+ automatically invoked at the beginning of a panic to suppress
+ further RCU CPU stall warnings.
+
+ <p>
+ This requirement made itself known in the early 1990s, pretty
+ much the first time that it was necessary to debug a CPU stall.
+ That said, the initial implementation in DYNIX/ptx was quite
+ generic in comparison with that of Linux.
+<li> Although it would be very good to detect pointers leaking out
+ of RCU read-side critical sections, there is currently no
+ good way of doing this.
+ One complication is the need to distinguish between pointers
+ leaking and pointers that have been handed off from RCU to
+ some other synchronization mechanism, for example, reference
+ counting.
+<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
+ information is provided via both debugfs and event tracing.
+<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
+ <tt>rcu_dereference()</tt> to create typical linked
+ data structures can be surprisingly error-prone.
+ Therefore, RCU-protected
+ <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
+ and, more recently, RCU-protected
+ <a href="https://lwn.net/Articles/612100/">hash tables</a>
+ are available.
+ Many other special-purpose RCU-protected data structures are
+ available in the Linux kernel and the userspace RCU library.
+<li> Some linked structures are created at compile time, but still
+ require <tt>__rcu</tt> checking.
+ The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
+ purpose.
+<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
+ when creating linked structures that are to be published via
+ a single external pointer.
+ The <tt>RCU_INIT_POINTER()</tt> macro is provided for
+ this task and also for assigning <tt>NULL</tt> pointers
+ at runtime.
+</ol>
+
+<p>
+This is not a hard-and-fast list: RCU's diagnostic capabilities will
+continue to be guided by the number and type of usage bugs found
+in real-world RCU usage.
+
+<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
+
+<p>
+The Linux kernel provides an interesting environment for all kinds of
+software, including RCU.
+Some of the relevant points of interest are as follows:
+
+<ol>
+<li> <a href="#Configuration">Configuration</a>.
+<li> <a href="#Firmware Interface">Firmware Interface</a>.
+<li> <a href="#Early Boot">Early Boot</a>.
+<li> <a href="#Interrupts and NMIs">
+ Interrupts and non-maskable interrupts (NMIs)</a>.
+<li> <a href="#Loadable Modules">Loadable Modules</a>.
+<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
+<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
+<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
+<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
+<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
+<li> <a href="#Performance, Scalability, Response Time, and Reliability">
+ Performance, Scalability, Response Time, and Reliability</a>.
+</ol>
+
+<p>
+This list is probably incomplete, but it does give a feel for the
+most notable Linux-kernel complications.
+Each of the following sections covers one of the above topics.
+
+<h3><a name="Configuration">Configuration</a></h3>
+
+<p>
+RCU's goal is automatic configuration, so that almost nobody
+needs to worry about RCU's <tt>Kconfig</tt> options.
+And for almost all users, RCU does in fact work well
+“out of the box.”
+
+<p>
+However, there are specialized use cases that are handled by
+kernel boot parameters and <tt>Kconfig</tt> options.
+Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
+about new <tt>Kconfig</tt> options, which requires that almost all of them
+be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
+
+<p>
+This all should be quite obvious, but the fact remains that
+Linus Torvalds recently had to
+<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
+me of this requirement.
+
+<h3><a name="Firmware Interface">Firmware Interface</a></h3>
+
+<p>
+In many cases, the kernel obtains information about the system from the
+firmware, and sometimes things are lost in translation.
+Or the translation is accurate, but the original message is bogus.
+
+<p>
+For example, some systems' firmware overreports the number of CPUs,
+sometimes by a large factor.
+If RCU naively believed the firmware, as it used to do,
+it would create too many per-CPU kthreads.
+Although the resulting system will still run correctly, the extra
+kthreads needlessly consume memory and can cause confusion
+when they show up in <tt>ps</tt> listings.
+
+<p>
+RCU must therefore wait for a given CPU to actually come online before
+it can allow itself to believe that the CPU actually exists.
+The resulting “ghost CPUs” (which are never going to
+come online) cause a number of
+<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
+
+<h3><a name="Early Boot">Early Boot</a></h3>
+
+<p>
+The Linux kernel's boot sequence is an interesting process,
+and RCU is used early, even before <tt>rcu_init()</tt>
+is invoked.
+In fact, a number of RCU's primitives can be used as soon as the
+initial task's <tt>task_struct</tt> is available and the
+boot CPU's per-CPU variables are set up.
+The read-side primitives (<tt>rcu_read_lock()</tt>,
+<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
+and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
+as will <tt>rcu_assign_pointer()</tt>.
+
+<p>
+Although <tt>call_rcu()</tt> may be invoked at any
+time during boot, callbacks are not guaranteed to be invoked until after
+the scheduler is fully up and running.
+This delay in callback invocation is due to the fact that RCU does not
+invoke callbacks until it is fully initialized, and this full initialization
+cannot occur until after the scheduler has initialized itself to the
+point where RCU can spawn and run its kthreads.
+In theory, it would be possible to invoke callbacks earlier,
+however, this is not a panacea because there would be severe restrictions
+on what operations those callbacks could invoke.
+
+<p>
+Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
+<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
+(<a href="#Bottom-Half Flavor">discussed below</a>),
+and
+<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
+will all operate normally
+during very early boot, the reason being that there is only one CPU
+and preemption is disabled.
+This means that the call to <tt>synchronize_rcu()</tt> (or friends)
+itself is a quiescent
+state and thus a grace period, so the early-boot implementation can
+be a no-op.
+
+<p>
+Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
+continue to operate normally through the remainder of boot, courtesy
+of the fact that preemption is disabled across their RCU read-side
+critical sections and also courtesy of the fact that there is still
+only one CPU.
+However, once the scheduler starts initializing, preemption is enabled.
+There is still only a single CPU, but the fact that preemption is enabled
+means that the no-op implementation of <tt>synchronize_rcu()</tt> no
+longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
+Therefore, as soon as the scheduler starts initializing, the early-boot
+fastpath is disabled.
+This means that <tt>synchronize_rcu()</tt> switches to its runtime
+mode of operation where it posts callbacks, which in turn means that
+any call to <tt>synchronize_rcu()</tt> will block until the corresponding
+callback is invoked.
+Unfortunately, the callback cannot be invoked until RCU's runtime
+grace-period machinery is up and running, which cannot happen until
+the scheduler has initialized itself sufficiently to allow RCU's
+kthreads to be spawned.
+Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
+initialization can result in deadlock.
+
+<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a>
+So what happens with <tt>synchronize_rcu()</tt> during
+scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
+kernels?
+<br><a href="#qq14answer">Answer</a>
+
+<p>
+I learned of these boot-time requirements as a result of a series of
+system hangs.
+
+<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
+
+<p>
+The Linux kernel has interrupts, and RCU read-side critical sections are
+legal within interrupt handlers and within interrupt-disabled regions
+of code, as are invocations of <tt>call_rcu()</tt>.
+
+<p>
+Some Linux-kernel architectures can enter an interrupt handler from
+non-idle process context, and then just never leave it, instead stealthily
+transitioning back to process context.
+This trick is sometimes used to invoke system calls from inside the kernel.
+These “half-interrupts” mean that RCU has to be very careful
+about how it counts interrupt nesting levels.
+I learned of this requirement the hard way during a rewrite
+of RCU's dyntick-idle code.
+
+<p>
+The Linux kernel has non-maskable interrupts (NMIs), and
+RCU read-side critical sections are legal within NMI handlers.
+Thankfully, RCU update-side primitives, including
+<tt>call_rcu()</tt>, are prohibited within NMI handlers.
+
+<p>
+The name notwithstanding, some Linux-kernel architectures
+can have nested NMIs, which RCU must handle correctly.
+Andy Lutomirski
+<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
+with this requirement;
+he also kindly surprised me with
+<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
+that meets this requirement.
+
+<h3><a name="Loadable Modules">Loadable Modules</a></h3>
+
+<p>
+The Linux kernel has loadable modules, and these modules can
+also be unloaded.
+After a given module has been unloaded, any attempt to call
+one of its functions results in a segmentation fault.
+The module-unload functions must therefore cancel any
+delayed calls to loadable-module functions, for example,
+any outstanding <tt>mod_timer()</tt> must be dealt with
+via <tt>del_timer_sync()</tt> or similar.
+
+<p>
+Unfortunately, there is no way to cancel an RCU callback;
+once you invoke <tt>call_rcu()</tt>, the callback function is
+going to eventually be invoked, unless the system goes down first.
+Because it is normally considered socially irresponsible to crash the system
+in response to a module unload request, we need some other way
+to deal with in-flight RCU callbacks.
+
+<p>
+RCU therefore provides
+<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
+which waits until all in-flight RCU callbacks have been invoked.
+If a module uses <tt>call_rcu()</tt>, its exit function should therefore
+prevent any future invocation of <tt>call_rcu()</tt>, then invoke
+<tt>rcu_barrier()</tt>.
+In theory, the underlying module-unload code could invoke
+<tt>rcu_barrier()</tt> unconditionally, but in practice this would
+incur unacceptable latencies.
+
+<p>
+Nikita Danilov noted this requirement for an analogous filesystem-unmount
+situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
+The need for <tt>rcu_barrier()</tt> for module unloading became
+apparent later.
+
+<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
+
+<p>
+The Linux kernel supports CPU hotplug, which means that CPUs
+can come and go.
+It is of course illegal to use any RCU API member from an offline CPU.
+This requirement was present from day one in DYNIX/ptx, but
+on the other hand, the Linux kernel's CPU-hotplug implementation
+is “interesting.”
+
+<p>
+The Linux-kernel CPU-hotplug implementation has notifiers that
+are used to allow the various kernel subsystems (including RCU)
+to respond appropriately to a given CPU-hotplug operation.
+Most RCU operations may be invoked from CPU-hotplug notifiers,
+including even normal synchronous grace-period operations
+such as <tt>synchronize_rcu()</tt>.
+However, expedited grace-period operations such as
+<tt>synchronize_rcu_expedited()</tt> are not supported,
+due to the fact that current implementations block CPU-hotplug
+operations, which could result in deadlock.
+
+<p>
+In addition, all-callback-wait operations such as
+<tt>rcu_barrier()</tt> are also not supported, due to the
+fact that there are phases of CPU-hotplug operations where
+the outgoing CPU's callbacks will not be invoked until after
+the CPU-hotplug operation ends, which could also result in deadlock.
+
+<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
+
+<p>
+RCU depends on the scheduler, and the scheduler uses RCU to
+protect some of its data structures.
+This means the scheduler is forbidden from acquiring
+the runqueue locks and the priority-inheritance locks
+in the middle of an outermost RCU read-side critical section unless either
+(1) it releases them before exiting that same
+RCU read-side critical section, or
+(2) interrupts are disabled across
+that entire RCU read-side critical section.
+This same prohibition also applies (recursively!) to any lock that is acquired
+while holding any lock to which this prohibition applies.
+Adhering to this rule prevents preemptible RCU from invoking
+<tt>rcu_read_unlock_special()</tt> while either runqueue or
+priority-inheritance locks are held, thus avoiding deadlock.
+
+<p>
+Prior to v4.4, it was only necessary to disable preemption across
+RCU read-side critical sections that acquired scheduler locks.
+In v4.4, expedited grace periods started using IPIs, and these
+IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
+Therefore, this expedited-grace-period change required disabling of
+interrupts, not just preemption.
+
+<p>
+For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
+implementation must be written carefully to avoid similar deadlocks.
+In particular, <tt>rcu_read_unlock()</tt> must tolerate an
+interrupt where the interrupt handler invokes both
+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
+This possibility requires <tt>rcu_read_unlock()</tt> to use
+negative nesting levels to avoid destructive recursion via
+an interrupt handler's use of RCU.
+
+<p>
+This pair of mutual scheduler-RCU requirements came as a
+<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
+
+<p>
+As noted above, RCU makes use of kthreads, and it is necessary to
+avoid excessive CPU-time accumulation by these kthreads.
+This requirement was no surprise, but RCU's violation of it
+when running context-switch-heavy workloads when built with
+<tt>CONFIG_NO_HZ_FULL=y</tt>
+<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
+RCU has made good progress towards meeting this requirement, even
+for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
+but there is room for further improvement.
+
+<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
+
+<p>
+It is possible to use tracing on RCU code, but tracing itself
+uses RCU.
+For this reason, <tt>rcu_dereference_raw_notrace()</tt>
+is provided for use by tracing, which avoids the destructive
+recursion that could otherwise ensue.
+This API is also used by virtualization in some architectures,
+where RCU readers execute in environments in which tracing
+cannot be used.
+The tracing folks both located the requirement and provided the
+needed fix, so this surprise requirement was relatively painless.
+
+<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
+
+<p>
+Interrupting idle CPUs is considered socially unacceptable,
+especially by people with battery-powered embedded systems.
+RCU therefore conserves energy by detecting which CPUs are
+idle, including tracking CPUs that have been interrupted from idle.
+This is a large part of the energy-efficiency requirement,
+so I learned of this via an irate phone call.
+
+<p>
+Because RCU avoids interrupting idle CPUs, it is illegal to
+execute an RCU read-side critical section on an idle CPU.
+(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
+if you try it.)
+The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
+event tracing are provided to work around this restriction.
+In addition, <tt>rcu_is_watching()</tt> may be used to
+test whether or not it is currently legal to run RCU read-side
+critical sections on this CPU.
+I learned of the need for diagnostics on the one hand
+and <tt>RCU_NONIDLE()</tt> on the other while inspecting
+idle-loop code.
+Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
+which is used quite heavily in the idle loop.
+
+<p>
+It is similarly socially unacceptable to interrupt an
+<tt>nohz_full</tt> CPU running in userspace.
+RCU must therefore track <tt>nohz_full</tt> userspace
+execution.
+And in
+<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
+kernels, RCU must separately track idle CPUs on the one hand and
+CPUs that are either idle or executing in userspace on the other.
+In both cases, RCU must be able to sample state at two points in
+time, and be able to determine whether or not some other CPU spent
+any time idle and/or executing in userspace.
+
+<p>
+These energy-efficiency requirements have proven quite difficult to
+understand and to meet; for example, there have been more than five
+clean-sheet rewrites of RCU's energy-efficiency code, the last of
+which was finally able to demonstrate
+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
+As noted earlier,
+I learned of many of these requirements via angry phone calls:
+Flaming me on the Linux-kernel mailing list was apparently not
+sufficient to fully vent their ire at RCU's energy-efficiency bugs!
+
+<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
+
+<p>
+Although small-memory non-realtime systems can simply use Tiny RCU,
+code size is only one aspect of memory efficiency.
+Another aspect is the size of the <tt>rcu_head</tt> structure
+used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
+Although this structure contains nothing more than a pair of pointers,
+it does appear in many RCU-protected data structures, including
+some that are size critical.
+The <tt>page</tt> structure is a case in point, as evidenced by
+the many occurrences of the <tt>union</tt> keyword within that structure.
+
+<p>
+This need for memory efficiency is one reason that RCU uses hand-crafted
+singly linked lists to track the <tt>rcu_head</tt> structures that
+are waiting for a grace period to elapse.
+It is also the reason why <tt>rcu_head</tt> structures do not contain
+debug information, such as fields tracking the file and line of the
+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
+Although this information might appear in debug-only kernel builds at some
+point, in the meantime, the <tt>->func</tt> field will often provide
+the needed debug information.
+
+<p>
+However, in some cases, the need for memory efficiency leads to even
+more extreme measures.
+Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
+shares storage with a great many other structures that are used at
+various points in the corresponding page's lifetime.
+In order to correctly resolve certain
+<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
+the Linux kernel's memory-management subsystem needs a particular bit
+to remain zero during all phases of grace-period processing,
+and that bit happens to map to the bottom bit of the
+<tt>rcu_head</tt> structure's <tt>->next</tt> field.
+RCU makes this guarantee as long as <tt>call_rcu()</tt>
+is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
+or some future “lazy”
+variant of <tt>call_rcu()</tt> that might one day be created for
+energy-efficiency purposes.
+
+<h3><a name="Performance, Scalability, Response Time, and Reliability">
+Performance, Scalability, Response Time, and Reliability</a></h3>
+
+<p>
+Expanding on the
+<a href="#Performance and Scalability">earlier discussion</a>,
+RCU is used heavily by hot code paths in performance-critical
+portions of the Linux kernel's networking, security, virtualization,
+and scheduling code paths.
+RCU must therefore use efficient implementations, especially in its
+read-side primitives.
+To that end, it would be good if preemptible RCU's implementation
+of <tt>rcu_read_lock()</tt> could be inlined, however, doing
+this requires resolving <tt>#include</tt> issues with the
+<tt>task_struct</tt> structure.
+
+<p>
+The Linux kernel supports hardware configurations with up to
+4096 CPUs, which means that RCU must be extremely scalable.
+Algorithms that involve frequent acquisitions of global locks or
+frequent atomic operations on global variables simply cannot be
+tolerated within the RCU implementation.
+RCU therefore makes heavy use of a combining tree based on the
+<tt>rcu_node</tt> structure.
+RCU is required to tolerate all CPUs continuously invoking any
+combination of RCU's runtime primitives with minimal per-operation
+overhead.
+In fact, in many cases, increasing load must <i>decrease</i> the
+per-operation overhead, witness the batching optimizations for
+<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
+<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
+As a general rule, RCU must cheerfully accept whatever the
+rest of the Linux kernel decides to throw at it.
+
+<p>
+The Linux kernel is used for real-time workloads, especially
+in conjunction with the
+<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
+The real-time-latency response requirements are such that the
+traditional approach of disabling preemption across RCU
+read-side critical sections is inappropriate.
+Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
+use an RCU implementation that allows RCU read-side critical
+sections to be preempted.
+This requirement made its presence known after users made it
+clear that an earlier
+<a href="https://lwn.net/Articles/107930/">real-time patch</a>
+did not meet their needs, in conjunction with some
+<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
+encountered by a very early version of the -rt patchset.
+
+<p>
+In addition, RCU must make do with a sub-100-microsecond real-time latency
+budget.
+In fact, on smaller systems with the -rt patchset, the Linux kernel
+provides sub-20-microsecond real-time latencies for the whole kernel,
+including RCU.
+RCU's scalability and latency must therefore be sufficient for
+these sorts of configurations.
+To my surprise, the sub-100-microsecond real-time latency budget
+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
+applies to even the largest systems [PDF]</a>,
+up to and including systems with 4096 CPUs.
+This real-time requirement motivated the grace-period kthread, which
+also simplified handling of a number of race conditions.
+
+<p>
+Finally, RCU's status as a synchronization primitive means that
+any RCU failure can result in arbitrary memory corruption that can be
+extremely difficult to debug.
+This means that RCU must be extremely reliable, which in
+practice also means that RCU must have an aggressive stress-test
+suite.
+This stress-test suite is called <tt>rcutorture</tt>.
+
+<p>
+Although the need for <tt>rcutorture</tt> was no surprise,
+the current immense popularity of the Linux kernel is posing
+interesting—and perhaps unprecedented—validation
+challenges.
+To see this, keep in mind that there are well over one billion
+instances of the Linux kernel running today, given Android
+smartphones, Linux-powered televisions, and servers.
+This number can be expected to increase sharply with the advent of
+the celebrated Internet of Things.
+
+<p>
+Suppose that RCU contains a race condition that manifests on average
+once per million years of runtime.
+This bug will be occurring about three times per <i>day</i> across
+the installed base.
+RCU could simply hide behind hardware error rates, given that no one
+should really expect their smartphone to last for a million years.
+However, anyone taking too much comfort from this thought should
+consider the fact that in most jurisdictions, a successful multi-year
+test of a given mechanism, which might include a Linux kernel,
+suffices for a number of types of safety-critical certifications.
+In fact, rumor has it that the Linux kernel is already being used
+in production for safety-critical applications.
+I don't know about you, but I would feel quite bad if a bug in RCU
+killed someone.
+Which might explain my recent focus on validation and verification.
+
+<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
+
+<p>
+One of the more surprising things about RCU is that there are now
+no fewer than five <i>flavors</i>, or API families.
+In addition, the primary flavor that has been the sole focus up to
+this point has two different implementations, non-preemptible and
+preemptible.
+The other four flavors are listed below, with requirements for each
+described in a separate section.
+
+<ol>
+<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
+<li> <a href="#Sched Flavor">Sched Flavor</a>
+<li> <a href="#Sleepable RCU">Sleepable RCU</a>
+<li> <a href="#Tasks RCU">Tasks RCU</a>
+</ol>
+
+<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
+
+<p>
+The softirq-disable (AKA “bottom-half”,
+hence the “_bh” abbreviations)
+flavor of RCU, or <i>RCU-bh</i>, was developed by
+Dipankar Sarma to provide a flavor of RCU that could withstand the
+network-based denial-of-service attacks researched by Robert
+Olsson.
+These attacks placed so much networking load on the system
+that some of the CPUs never exited softirq execution,
+which in turn prevented those CPUs from ever executing a context switch,
+which, in the RCU implementation of that time, prevented grace periods
+from ever ending.
+The result was an out-of-memory condition and a system hang.
+
+<p>
+The solution was the creation of RCU-bh, which does
+<tt>local_bh_disable()</tt>
+across its read-side critical sections, and which uses the transition
+from one type of softirq processing to another as a quiescent state
+in addition to context switch, idle, user mode, and offline.
+This means that RCU-bh grace periods can complete even when some of
+the CPUs execute in softirq indefinitely, thus allowing algorithms
+based on RCU-bh to withstand network-based denial-of-service attacks.
+
+<p>
+Because
+<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
+disable and re-enable softirq handlers, any attempt to start a softirq
+handler during the
+RCU-bh read-side critical section will be deferred.
+In this case, <tt>rcu_read_unlock_bh()</tt>
+will invoke softirq processing, which can take considerable time.
+One can of course argue that this softirq overhead should be associated
+with the code following the RCU-bh read-side critical section rather
+than <tt>rcu_read_unlock_bh()</tt>, but the fact
+is that most profiling tools cannot be expected to make this sort
+of fine distinction.
+For example, suppose that a three-millisecond-long RCU-bh read-side
+critical section executes during a time of heavy networking load.
+There will very likely be an attempt to invoke at least one softirq
+handler during that three milliseconds, but any such invocation will
+be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
+This can of course make it appear at first glance as if
+<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
+
+<p>
+The
+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
+includes
+<tt>rcu_read_lock_bh()</tt>,
+<tt>rcu_read_unlock_bh()</tt>,
+<tt>rcu_dereference_bh()</tt>,
+<tt>rcu_dereference_bh_check()</tt>,
+<tt>synchronize_rcu_bh()</tt>,
+<tt>synchronize_rcu_bh_expedited()</tt>,
+<tt>call_rcu_bh()</tt>,
+<tt>rcu_barrier_bh()</tt>, and
+<tt>rcu_read_lock_bh_held()</tt>.
+
+<h3><a name="Sched Flavor">Sched Flavor</a></h3>
+
+<p>
+Before preemptible RCU, waiting for an RCU grace period had the
+side effect of also waiting for all pre-existing interrupt
+and NMI handlers.
+However, there are legitimate preemptible-RCU implementations that
+do not have this property, given that any point in the code outside
+of an RCU read-side critical section can be a quiescent state.
+Therefore, <i>RCU-sched</i> was created, which follows “classic”
+RCU in that an RCU-sched grace period waits for pre-existing
+interrupt and NMI handlers.
+In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
+APIs have identical implementations, while kernels built with
+<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
+
+<p>
+Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
+<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
+disable and re-enable preemption, respectively.
+This means that if there was a preemption attempt during the
+RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
+will enter the scheduler, with all the latency and overhead entailed.
+Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
+as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
+However, the highest-priority task won't be preempted, so that task
+will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
+
+<p>
+The
+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
+includes
+<tt>rcu_read_lock_sched()</tt>,
+<tt>rcu_read_unlock_sched()</tt>,
+<tt>rcu_read_lock_sched_notrace()</tt>,
+<tt>rcu_read_unlock_sched_notrace()</tt>,
+<tt>rcu_dereference_sched()</tt>,
+<tt>rcu_dereference_sched_check()</tt>,
+<tt>synchronize_sched()</tt>,
+<tt>synchronize_rcu_sched_expedited()</tt>,
+<tt>call_rcu_sched()</tt>,
+<tt>rcu_barrier_sched()</tt>, and
+<tt>rcu_read_lock_sched_held()</tt>.
+However, anything that disables preemption also marks an RCU-sched
+read-side critical section, including
+<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
+<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
+and so on.
+
+<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
+
+<p>
+For well over a decade, someone saying “I need to block within
+an RCU read-side critical section” was a reliable indication
+that this someone did not understand RCU.
+After all, if you are always blocking in an RCU read-side critical
+section, you can probably afford to use a higher-overhead synchronization
+mechanism.
+However, that changed with the advent of the Linux kernel's notifiers,
+whose RCU read-side critical
+sections almost never sleep, but sometimes need to.
+This resulted in the introduction of
+<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
+or <i>SRCU</i>.
+
+<p>
+SRCU allows different domains to be defined, with each such domain
+defined by an instance of an <tt>srcu_struct</tt> structure.
+A pointer to this structure must be passed in to each SRCU function,
+for example, <tt>synchronize_srcu(&ss)</tt>, where
+<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
+The key benefit of these domains is that a slow SRCU reader in one
+domain does not delay an SRCU grace period in some other domain.
+That said, one consequence of these domains is that read-side code
+must pass a “cookie” from <tt>srcu_read_lock()</tt>
+to <tt>srcu_read_unlock()</tt>, for example, as follows:
+
+<blockquote>
+<pre>
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+</pre>
+</blockquote>
+
+<p>
+As noted above, it is legal to block within SRCU read-side critical sections,
+however, with great power comes great responsibility.
+If you block forever in one of a given domain's SRCU read-side critical
+sections, then that domain's grace periods will also be blocked forever.
+Of course, one good way to block forever is to deadlock, which can
+happen if any operation in a given domain's SRCU read-side critical
+section can block waiting, either directly or indirectly, for that domain's
+grace period to elapse.
+For example, this results in a self-deadlock:
+
+<blockquote>
+<pre>
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+</pre>
+</blockquote>
+
+<p>
+However, if line 5 instead acquired a mutex that was held across
+a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
+deadlock would still be possible.
+Furthermore, if line 5 acquired a mutex that was held across
+a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
+and if an <tt>ss1</tt>-domain SRCU read-side critical section
+acquired another mutex that was held across an <tt>ss</tt>-domain
+<tt>synchronize_srcu()</tt>,
+deadlock would again be possible.
+Such a deadlock cycle could extend across an arbitrarily large number
+of different SRCU domains.
+Again, with great power comes great responsibility.
+
+<p>
+Unlike the other RCU flavors, SRCU read-side critical sections can
+run on idle and even offline CPUs.
+This ability requires that <tt>srcu_read_lock()</tt> and
+<tt>srcu_read_unlock()</tt> contain memory barriers, which means
+that SRCU readers will run a bit slower than would RCU readers.
+It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
+API, which, in combination with <tt>srcu_read_unlock()</tt>,
+guarantees a full memory barrier.
+
+<p>
+The
+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
+includes
+<tt>srcu_read_lock()</tt>,
+<tt>srcu_read_unlock()</tt>,
+<tt>srcu_dereference()</tt>,
+<tt>srcu_dereference_check()</tt>,
+<tt>synchronize_srcu()</tt>,
+<tt>synchronize_srcu_expedited()</tt>,
+<tt>call_srcu()</tt>,
+<tt>srcu_barrier()</tt>, and
+<tt>srcu_read_lock_held()</tt>.
+It also includes
+<tt>DEFINE_SRCU()</tt>,
+<tt>DEFINE_STATIC_SRCU()</tt>, and
+<tt>init_srcu_struct()</tt>
+APIs for defining and initializing <tt>srcu_struct</tt> structures.
+
+<h3><a name="Tasks RCU">Tasks RCU</a></h3>
+
+<p>
+Some forms of tracing use “trampolines” to handle the
+binary rewriting required to install different types of probes.
+It would be good to be able to free old trampolines, which sounds
+like a job for some form of RCU.
+However, because it is necessary to be able to install a trace
+anywhere in the code, it is not possible to use read-side markers
+such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
+In addition, it does not work to have these markers in the trampoline
+itself, because there would need to be instructions following
+<tt>rcu_read_unlock()</tt>.
+Although <tt>synchronize_rcu()</tt> would guarantee that execution
+reached the <tt>rcu_read_unlock()</tt>, it would not be able to
+guarantee that execution had completely left the trampoline.
+
+<p>
+The solution, in the form of
+<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
+is to have implicit
+read-side critical sections that are delimited by voluntary context
+switches, that is, calls to <tt>schedule()</tt>,
+<tt>cond_resched_rcu_qs()</tt>, and
+<tt>synchronize_rcu_tasks()</tt>.
+In addition, transitions to and from userspace execution also delimit
+tasks-RCU read-side critical sections.
+
+<p>
+The tasks-RCU API is quite compact, consisting only of
+<tt>call_rcu_tasks()</tt>,
+<tt>synchronize_rcu_tasks()</tt>, and
+<tt>rcu_barrier_tasks()</tt>.
+
+<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
+
+<p>
+One of the tricks that RCU uses to attain update-side scalability is
+to increase grace-period latency with increasing numbers of CPUs.
+If this becomes a serious problem, it will be necessary to rework the
+grace-period state machine so as to avoid the need for the additional
+latency.
+
+<p>
+Expedited grace periods scan the CPUs, so their latency and overhead
+increase with increasing numbers of CPUs.
+If this becomes a serious problem on large systems, it will be necessary
+to do some redesign to avoid this scalability problem.
+
+<p>
+RCU disables CPU hotplug in a few places, perhaps most notably in the
+expedited grace-period and <tt>rcu_barrier()</tt> operations.
+If there is a strong reason to use expedited grace periods in CPU-hotplug
+notifiers, it will be necessary to avoid disabling CPU hotplug.
+This would introduce some complexity, so there had better be a <i>very</i>
+good reason.
+
+<p>
+The tradeoff between grace-period latency on the one hand and interruptions
+of other CPUs on the other hand may need to be re-examined.
+The desire is of course for zero grace-period latency as well as zero
+interprocessor interrupts undertaken during an expedited grace period
+operation.
+While this ideal is unlikely to be achievable, it is quite possible that
+further improvements can be made.
+
+<p>
+The multiprocessor implementations of RCU use a combining tree that
+groups CPUs so as to reduce lock contention and increase cache locality.
+However, this combining tree does not spread its memory across NUMA
+nodes nor does it align the CPU groups with hardware features such
+as sockets or cores.
+Such spreading and alignment is currently believed to be unnecessary
+because the hotpath read-side primitives do not access the combining
+tree, nor does <tt>call_rcu()</tt> in the common case.
+If you believe that your architecture needs such spreading and alignment,
+then your architecture should also benefit from the
+<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
+to the number of CPUs in a socket, NUMA node, or whatever.
+If the number of CPUs is too large, use a fraction of the number of
+CPUs.
+If the number of CPUs is a large prime number, well, that certainly
+is an “interesting” architectural choice!
+More flexible arrangements might be considered, but only if
+<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
+if the inadequacy has been demonstrated by a carefully run and
+realistic system-level workload.
+
+<p>
+Please note that arrangements that require RCU to remap CPU numbers will
+require extremely good demonstration of need and full exploration of
+alternatives.
+
+<p>
+There is an embarrassingly large number of flavors of RCU, and this
+number has been increasing over time.
+Perhaps it will be possible to combine some at some future date.
+
+<p>
+RCU's various kthreads are reasonably recent additions.
+It is quite likely that adjustments will be required to more gracefully
+handle extreme loads.
+It might also be necessary to be able to relate CPU utilization by
+RCU's kthreads and softirq handlers to the code that instigated this
+CPU utilization.
+For example, RCU callback overhead might be charged back to the
+originating <tt>call_rcu()</tt> instance, though probably not
+in production kernels.
+
+<h2><a name="Summary">Summary</a></h2>
+
+<p>
+This document has presented more than two decades' worth of RCU
+requirements.
+Given that the requirements keep changing, this will not be the last
+word on this subject, but at least it serves to get an important
+subset of the requirements set forth.
+
+<h2><a name="Acknowledgments">Acknowledgments</a></h2>
+
+I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
+Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
+Andy Lutomirski for their help in rendering
+this article human readable, and to Michelle Rankin for her support
+of this effort.
+Other contributions are acknowledged in the Linux kernel's git archive.
+The cartoon is copyright (c) 2013 by Melissa Broussard,
+and is provided
+under the terms of the Creative Commons Attribution-Share Alike 3.0
+United States license.
+
+<h3><a name="Answers to Quick Quizzes">
+Answers to Quick Quizzes</a></h3>
+
+<a name="qq1answer"></a>
+<p><b>Quick Quiz 1</b>:
+Wait a minute!
+You said that updaters can make useful forward progress concurrently
+with readers, but pre-existing readers will block
+<tt>synchronize_rcu()</tt>!!!
+Just who are you trying to fool???
+
+
+</p><p><b>Answer</b>:
+First, if updaters do not wish to be blocked by readers, they can use
+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
+be discussed later.
+Second, even when using <tt>synchronize_rcu()</tt>, the other
+update-side code does run concurrently with readers, whether pre-existing
+or not.
+
+
+</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a>
+
+<a name="qq2answer"></a>
+<p><b>Quick Quiz 2</b>:
+Why is the <tt>synchronize_rcu()</tt> on line 28 needed?
+
+
+</p><p><b>Answer</b>:
+Without that extra grace period, memory reordering could result in
+<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
+concurrently with the last bits of <tt>recovery()</tt>.
+
+
+</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a>
+
+<a name="qq3answer"></a>
+<p><b>Quick Quiz 3</b>:
+But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
+two assignments to <tt>p->a</tt> and <tt>p->b</tt>
+from being reordered.
+Can't that also cause problems?
+
+
+</p><p><b>Answer</b>:
+No, it cannot.
+The readers cannot see either of these two fields until
+the assignment to <tt>gp</tt>, by which time both fields are
+fully initialized.
+So reordering the assignments
+to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly
+cause any problems.
+
+
+</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a>
+
+<a name="qq4answer"></a>
+<p><b>Quick Quiz 4</b>:
+Without the <tt>rcu_dereference()</tt> or the
+<tt>rcu_access_pointer()</tt>, what destructive optimizations
+might the compiler make use of?
+
+
+</p><p><b>Answer</b>:
+Let's start with what happens to <tt>do_something_gp()</tt>
+if it fails to use <tt>rcu_dereference()</tt>.
+It could reuse a value formerly fetched from this same pointer.
+It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
+manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
+mash-up of two distinct pointer values.
+It might even use value-speculation optimizations, where it makes a wrong
+guess, but by the time it gets around to checking the value, an update
+has changed the pointer to match the wrong guess.
+Too bad about any dereferences that returned pre-initialization garbage
+in the meantime!
+
+<p>
+For <tt>remove_gp_synchronous()</tt>, as long as all modifications
+to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
+the above optimizations are harmless.
+However,
+with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
+<tt>sparse</tt> will complain if you
+define <tt>gp</tt> with <tt>__rcu</tt> and then
+access it without using
+either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
+
+
+</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a>
+
+<a name="qq5answer"></a>
+<p><b>Quick Quiz 5</b>:
+Given that multiple CPUs can start RCU read-side critical sections
+at any time without any ordering whatsoever, how can RCU possibly tell whether
+or not a given RCU read-side critical section starts before a
+given instance of <tt>synchronize_rcu()</tt>?
+
+
+</p><p><b>Answer</b>:
+If RCU cannot tell whether or not a given
+RCU read-side critical section starts before a
+given instance of <tt>synchronize_rcu()</tt>,
+then it must assume that the RCU read-side critical section
+started first.
+In other words, a given instance of <tt>synchronize_rcu()</tt>
+can avoid waiting on a given RCU read-side critical section only
+if it can prove that <tt>synchronize_rcu()</tt> started first.
+
+
+</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a>
+
+<a name="qq6answer"></a>
+<p><b>Quick Quiz 6</b>:
+The first and second guarantees require unbelievably strict ordering!
+Are all these memory barriers <i> really</i> required?
+
+
+</p><p><b>Answer</b>:
+Yes, they really are required.
+To see why the first guarantee is required, consider the following
+sequence of events:
+
+<ol>
+<li> CPU 1: <tt>rcu_read_lock()</tt>
+<li> CPU 1: <tt>q = rcu_dereference(gp);
+ /* Very likely to return p. */</tt>
+<li> CPU 0: <tt>list_del_rcu(p);</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
+<li> CPU 1: <tt>do_something_with(q->a);
+ /* No smp_mb(), so might happen after kfree(). */</tt>
+<li> CPU 1: <tt>rcu_read_unlock()</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
+<li> CPU 0: <tt>kfree(p);</tt>
+</ol>
+
+<p>
+Therefore, there absolutely must be a full memory barrier between the
+end of the RCU read-side critical section and the end of the
+grace period.
+
+<p>
+The sequence of events demonstrating the necessity of the second rule
+is roughly similar:
+
+<ol>
+<li> CPU 0: <tt>list_del_rcu(p);</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
+<li> CPU 1: <tt>rcu_read_lock()</tt>
+<li> CPU 1: <tt>q = rcu_dereference(gp);
+ /* Might return p if no memory barrier. */</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
+<li> CPU 0: <tt>kfree(p);</tt>
+<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt>
+<li> CPU 1: <tt>rcu_read_unlock()</tt>
+</ol>
+
+<p>
+And similarly, without a memory barrier between the beginning of the
+grace period and the beginning of the RCU read-side critical section,
+CPU 1 might end up accessing the freelist.
+
+<p>
+The “as if” rule of course applies, so that any implementation
+that acts as if the appropriate memory barriers were in place is a
+correct implementation.
+That said, it is much easier to fool yourself into believing that you have
+adhered to the as-if rule than it is to actually adhere to it!
+
+
+</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a>
+
+<a name="qq7answer"></a>
+<p><b>Quick Quiz 7</b>:
+But how does the upgrade-to-write operation exclude other readers?
+
+
+</p><p><b>Answer</b>:
+It doesn't, just like normal RCU updates, which also do not exclude
+RCU readers.
+
+
+</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a>
+
+<a name="qq8answer"></a>
+<p><b>Quick Quiz 8</b>:
+Can't the compiler also reorder this code?
+
+
+</p><p><b>Answer</b>:
+No, the volatile casts in <tt>READ_ONCE()</tt> and
+<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
+this particular case.
+
+
+</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a>
+
+<a name="qq9answer"></a>
+<p><b>Quick Quiz 9</b>:
+Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
+Would the updater be able to rely on this?
+
+
+</p><p><b>Answer</b>:
+No.
+Even if <tt>synchronize_rcu()</tt> were to wait until
+all readers had completed, a new reader might start immediately after
+<tt>synchronize_rcu()</tt> completed.
+Therefore, the code following
+<tt>synchronize_rcu()</tt> cannot rely on there being no readers
+in any case.
+
+
+</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a>
+
+<a name="qq10answer"></a>
+<p><b>Quick Quiz 10</b>:
+How long a sequence of grace periods, each separated by an RCU read-side
+critical section, would be required to partition the RCU read-side
+critical sections at the beginning and end of the chain?
+
+
+</p><p><b>Answer</b>:
+In theory, an infinite number.
+In practice, an unknown number that is sensitive to both implementation
+details and timing considerations.
+Therefore, even in practice, RCU users must abide by the theoretical rather
+than the practical answer.
+
+
+</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a>
+
+<a name="qq11answer"></a>
+<p><b>Quick Quiz 11</b>:
+What about sleeping locks?
+
+
+</p><p><b>Answer</b>:
+These are forbidden within Linux-kernel RCU read-side critical sections
+because it is not legal to place a quiescent state (in this case,
+voluntary context switch) within an RCU read-side critical section.
+However, sleeping locks may be used within userspace RCU read-side critical
+sections, and also within Linux-kernel sleepable RCU
+<a href="#Sleepable RCU">(SRCU)</a>
+read-side critical sections.
+In addition, the -rt patchset turns spinlocks into sleeping locks so
+that the corresponding critical sections can be preempted, which
+also means that these sleeplockified spinlocks (but not other sleeping locks!)
+may be acquired within -rt-Linux-kernel RCU read-side critical sections.
+
+<p>
+Note that it <i>is</i> legal for a normal RCU read-side critical section
+to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
+but only as long as it does not loop indefinitely attempting to
+conditionally acquire that sleeping lock.
+The key point is that things like <tt>mutex_trylock()</tt>
+either return with the mutex held, or return an error indication if
+the mutex was not immediately available.
+Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
+
+
+</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a>
+
+<a name="qq12answer"></a>
+<p><b>Quick Quiz 12</b>:
+Why does line 19 use <tt>rcu_access_pointer()</tt>?
+After all, <tt>call_rcu()</tt> on line 25 stores into the
+structure, which would interact badly with concurrent insertions.
+Doesn't this mean that <tt>rcu_dereference()</tt> is required?
+
+
+</p><p><b>Answer</b>:
+Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes
+any changes, including any insertions that <tt>rcu_dereference()</tt>
+would protect against.
+Therefore, any insertions will be delayed until after <tt>->gp_lock</tt>
+is released on line 25, which in turn means that
+<tt>rcu_access_pointer()</tt> suffices.
+
+
+</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a>
+
+<a name="qq13answer"></a>
+<p><b>Quick Quiz 13</b>:
+Earlier it was claimed that <tt>call_rcu()</tt> and
+<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
+by readers.
+But how can that be correct, given that the invocation of the callback
+and the freeing of the memory (respectively) must still wait for
+a grace period to elapse?
+
+
+</p><p><b>Answer</b>:
+We could define things this way, but keep in mind that this sort of
+definition would say that updates in garbage-collected languages
+cannot complete until the next time the garbage collector runs,
+which does not seem at all reasonable.
+The key point is that in most cases, an updater using either
+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
+next update as soon as it has invoked <tt>call_rcu()</tt> or
+<tt>kfree_rcu()</tt>, without having to wait for a subsequent
+grace period.
+
+
+</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a>
+
+<a name="qq14answer"></a>
+<p><b>Quick Quiz 14</b>:
+So what happens with <tt>synchronize_rcu()</tt> during
+scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
+kernels?
+
+
+</p><p><b>Answer</b>:
+In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
+maps directly to <tt>synchronize_sched()</tt>.
+Therefore, <tt>synchronize_rcu()</tt> works normally throughout
+boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
+However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
+so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
+during scheduler initialization.
+
+
+</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a>
+
+
+</body></html>
--- /dev/null
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
+ "http://www.w3.org/TR/html4/loose.dtd">
+ <html>
+ <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
+ <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
+
+<h1>A Tour Through RCU's Requirements</h1>
+
+<p>Copyright IBM Corporation, 2015</p>
+<p>Author: Paul E. McKenney</p>
+<p><i>The initial version of this document appeared in the
+<a href="https://lwn.net/">LWN</a> articles
+<a href="https://lwn.net/Articles/652156/">here</a>,
+<a href="https://lwn.net/Articles/652677/">here</a>, and
+<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
+
+<h2>Introduction</h2>
+
+<p>
+Read-copy update (RCU) is a synchronization mechanism that is often
+used as a replacement for reader-writer locking.
+RCU is unusual in that updaters do not block readers,
+which means that RCU's read-side primitives can be exceedingly fast
+and scalable.
+In addition, updaters can make useful forward progress concurrently
+with readers.
+However, all this concurrency between RCU readers and updaters does raise
+the question of exactly what RCU readers are doing, which in turn
+raises the question of exactly what RCU's requirements are.
+
+<p>
+This document therefore summarizes RCU's requirements, and can be thought
+of as an informal, high-level specification for RCU.
+It is important to understand that RCU's specification is primarily
+empirical in nature;
+in fact, I learned about many of these requirements the hard way.
+This situation might cause some consternation; however, not only
+has this learning process been a lot of fun, but it has also been
+a great privilege to work with so many people willing to apply
+technologies in interesting new ways.
+
+<p>
+All that aside, here are the categories of currently known RCU requirements:
+</p>
+
+<ol>
+<li> <a href="#Fundamental Requirements">
+ Fundamental Requirements</a>
+<li> <a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
+<li> <a href="#Parallelism Facts of Life">
+ Parallelism Facts of Life</a>
+<li> <a href="#Quality-of-Implementation Requirements">
+ Quality-of-Implementation Requirements</a>
+<li> <a href="#Linux Kernel Complications">
+ Linux Kernel Complications</a>
+<li> <a href="#Software-Engineering Requirements">
+ Software-Engineering Requirements</a>
+<li> <a href="#Other RCU Flavors">
+ Other RCU Flavors</a>
+<li> <a href="#Possible Future Changes">
+ Possible Future Changes</a>
+</ol>
+
+<p>
+This is followed by a <a href="#Summary">summary</a>,
+which is in turn followed by the inevitable
+<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
+
+<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
+
+<p>
+RCU's fundamental requirements are the closest thing RCU has to hard
+mathematical requirements.
+These are:
+
+<ol>
+<li> <a href="#Grace-Period Guarantee">
+ Grace-Period Guarantee</a>
+<li> <a href="#Publish-Subscribe Guarantee">
+ Publish-Subscribe Guarantee</a>
+<li> <a href="#Memory-Barrier Guarantees">
+ Memory-Barrier Guarantees</a>
+<li> <a href="#RCU Primitives Guaranteed to Execute Unconditionally">
+ RCU Primitives Guaranteed to Execute Unconditionally</a>
+<li> <a href="#Guaranteed Read-to-Write Upgrade">
+ Guaranteed Read-to-Write Upgrade</a>
+</ol>
+
+<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
+
+<p>
+RCU's grace-period guarantee is unusual in being premeditated:
+Jack Slingwine and I had this guarantee firmly in mind when we started
+work on RCU (then called “rclock”) in the early 1990s.
+That said, the past two decades of experience with RCU have produced
+a much more detailed understanding of this guarantee.
+
+<p>
+RCU's grace-period guarantee allows updaters to wait for the completion
+of all pre-existing RCU read-side critical sections.
+An RCU read-side critical section
+begins with the marker <tt>rcu_read_lock()</tt> and ends with
+the marker <tt>rcu_read_unlock()</tt>.
+These markers may be nested, and RCU treats a nested set as one
+big RCU read-side critical section.
+Production-quality implementations of <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
+fact have exactly zero overhead in Linux kernels built for production
+use with <tt>CONFIG_PREEMPT=n</tt>.
+
+<p>
+This guarantee allows ordering to be enforced with extremely low
+overhead to readers, for example:
+
+<blockquote>
+<pre>
+ 1 int x, y;
+ 2
+ 3 void thread0(void)
+ 4 {
+ 5 rcu_read_lock();
+ 6 r1 = READ_ONCE(x);
+ 7 r2 = READ_ONCE(y);
+ 8 rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13 WRITE_ONCE(x, 1);
+14 synchronize_rcu();
+15 WRITE_ONCE(y, 1);
+16 }
+</pre>
+</blockquote>
+
+<p>
+Because the <tt>synchronize_rcu()</tt> on line 14 waits for
+all pre-existing readers, any instance of <tt>thread0()</tt> that
+loads a value of zero from <tt>x</tt> must complete before
+<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
+also load a value of zero from <tt>y</tt>.
+Similarly, any instance of <tt>thread0()</tt> that loads a value of
+one from <tt>y</tt> must have started after the
+<tt>synchronize_rcu()</tt> started, and must therefore also load
+a value of one from <tt>x</tt>.
+Therefore, the outcome:
+<blockquote>
+<pre>
+(r1 == 0 && r2 == 1)
+</pre>
+</blockquote>
+cannot happen.
+
+<p>@@QQ@@
+Wait a minute!
+You said that updaters can make useful forward progress concurrently
+with readers, but pre-existing readers will block
+<tt>synchronize_rcu()</tt>!!!
+Just who are you trying to fool???
+<p>@@QQA@@
+First, if updaters do not wish to be blocked by readers, they can use
+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
+be discussed later.
+Second, even when using <tt>synchronize_rcu()</tt>, the other
+update-side code does run concurrently with readers, whether pre-existing
+or not.
+<p>@@QQE@@
+
+<p>
+This scenario resembles one of the first uses of RCU in
+<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
+which managed a distributed lock manager's transition into
+a state suitable for handling recovery from node failure,
+more or less as follows:
+
+<blockquote>
+<pre>
+ 1 #define STATE_NORMAL 0
+ 2 #define STATE_WANT_RECOVERY 1
+ 3 #define STATE_RECOVERING 2
+ 4 #define STATE_WANT_NORMAL 3
+ 5
+ 6 int state = STATE_NORMAL;
+ 7
+ 8 void do_something_dlm(void)
+ 9 {
+10 int state_snap;
+11
+12 rcu_read_lock();
+13 state_snap = READ_ONCE(state);
+14 if (state_snap == STATE_NORMAL)
+15 do_something();
+16 else
+17 do_something_carefully();
+18 rcu_read_unlock();
+19 }
+20
+21 void start_recovery(void)
+22 {
+23 WRITE_ONCE(state, STATE_WANT_RECOVERY);
+24 synchronize_rcu();
+25 WRITE_ONCE(state, STATE_RECOVERING);
+26 recovery();
+27 WRITE_ONCE(state, STATE_WANT_NORMAL);
+28 synchronize_rcu();
+29 WRITE_ONCE(state, STATE_NORMAL);
+30 }
+</pre>
+</blockquote>
+
+<p>
+The RCU read-side critical section in <tt>do_something_dlm()</tt>
+works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
+to guarantee that <tt>do_something()</tt> never runs concurrently
+with <tt>recovery()</tt>, but with little or no synchronization
+overhead in <tt>do_something_dlm()</tt>.
+
+<p>@@QQ@@
+Why is the <tt>synchronize_rcu()</tt> on line 28 needed?
+<p>@@QQA@@
+Without that extra grace period, memory reordering could result in
+<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
+concurrently with the last bits of <tt>recovery()</tt>.
+<p>@@QQE@@
+
+<p>
+In order to avoid fatal problems such as deadlocks,
+an RCU read-side critical section must not contain calls to
+<tt>synchronize_rcu()</tt>.
+Similarly, an RCU read-side critical section must not
+contain anything that waits, directly or indirectly, on completion of
+an invocation of <tt>synchronize_rcu()</tt>.
+
+<p>
+Although RCU's grace-period guarantee is useful in and of itself, with
+<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
+it would be good to be able to use RCU to coordinate read-side
+access to linked data structures.
+For this, the grace-period guarantee is not sufficient, as can
+be seen in function <tt>add_gp_buggy()</tt> below.
+We will look at the reader's code later, but in the meantime, just think of
+the reader as locklessly picking up the <tt>gp</tt> pointer,
+and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
+<tt>->a</tt> and <tt>->b</tt> fields.
+
+<blockquote>
+<pre>
+ 1 bool add_gp_buggy(int a, int b)
+ 2 {
+ 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4 if (!p)
+ 5 return -ENOMEM;
+ 6 spin_lock(&gp_lock);
+ 7 if (rcu_access_pointer(gp)) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+11 p->a = a;
+12 p->b = a;
+13 gp = p; /* ORDERING BUG */
+14 spin_unlock(&gp_lock);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+The problem is that both the compiler and weakly ordered CPUs are within
+their rights to reorder this code as follows:
+
+<blockquote>
+<pre>
+ 1 bool add_gp_buggy_optimized(int a, int b)
+ 2 {
+ 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4 if (!p)
+ 5 return -ENOMEM;
+ 6 spin_lock(&gp_lock);
+ 7 if (rcu_access_pointer(gp)) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+<b>11 gp = p; /* ORDERING BUG */
+12 p->a = a;
+13 p->b = a;</b>
+14 spin_unlock(&gp_lock);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+If an RCU reader fetches <tt>gp</tt> just after
+<tt>add_gp_buggy_optimized</tt> executes line 11,
+it will see garbage in the <tt>->a</tt> and <tt>->b</tt>
+fields.
+And this is but one of many ways in which compiler and hardware optimizations
+could cause trouble.
+Therefore, we clearly need some way to prevent the compiler and the CPU from
+reordering in this manner, which brings us to the publish-subscribe
+guarantee discussed in the next section.
+
+<h3><a name="Publish-Subscribe Guarantee">Publish-Subscribe Guarantee</a></h3>
+
+<p>
+RCU's publish-subscribe guarantee allows data to be inserted
+into a linked data structure without disrupting RCU readers.
+The updater uses <tt>rcu_assign_pointer()</tt> to insert the
+new data, and readers use <tt>rcu_dereference()</tt> to
+access data, whether new or old.
+The following shows an example of insertion:
+
+<blockquote>
+<pre>
+ 1 bool add_gp(int a, int b)
+ 2 {
+ 3 p = kmalloc(sizeof(*p), GFP_KERNEL);
+ 4 if (!p)
+ 5 return -ENOMEM;
+ 6 spin_lock(&gp_lock);
+ 7 if (rcu_access_pointer(gp)) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+11 p->a = a;
+12 p->b = a;
+13 rcu_assign_pointer(gp, p);
+14 spin_unlock(&gp_lock);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+The <tt>rcu_assign_pointer()</tt> on line 13 is conceptually
+equivalent to a simple assignment statement, but also guarantees
+that its assignment will
+happen after the two assignments in lines 11 and 12,
+similar to the C11 <tt>memory_order_release</tt> store operation.
+It also prevents any number of “interesting” compiler
+optimizations, for example, the use of <tt>gp</tt> as a scratch
+location immediately preceding the assignment.
+
+<p>@@QQ@@
+But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
+two assignments to <tt>p->a</tt> and <tt>p->b</tt>
+from being reordered.
+Can't that also cause problems?
+<p>@@QQA@@
+No, it cannot.
+The readers cannot see either of these two fields until
+the assignment to <tt>gp</tt>, by which time both fields are
+fully initialized.
+So reordering the assignments
+to <tt>p->a</tt> and <tt>p->b</tt> cannot possibly
+cause any problems.
+<p>@@QQE@@
+
+<p>
+It is tempting to assume that the reader need not do anything special
+to control its accesses to the RCU-protected data,
+as shown in <tt>do_something_gp_buggy()</tt> below:
+
+<blockquote>
+<pre>
+ 1 bool do_something_gp_buggy(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 p = gp; /* OPTIMIZATIONS GALORE!!! */
+ 5 if (p) {
+ 6 do_something(p->a, p->b);
+ 7 rcu_read_unlock();
+ 8 return true;
+ 9 }
+10 rcu_read_unlock();
+11 return false;
+12 }
+</pre>
+</blockquote>
+
+<p>
+However, this temptation must be resisted because there are a
+surprisingly large number of ways that the compiler
+(to say nothing of
+<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
+can trip this code up.
+For but one example, if the compiler were short of registers, it
+might choose to refetch from <tt>gp</tt> rather than keeping
+a separate copy in <tt>p</tt> as follows:
+
+<blockquote>
+<pre>
+ 1 bool do_something_gp_buggy_optimized(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 if (gp) { /* OPTIMIZATIONS GALORE!!! */
+<b> 5 do_something(gp->a, gp->b);</b>
+ 6 rcu_read_unlock();
+ 7 return true;
+ 8 }
+ 9 rcu_read_unlock();
+10 return false;
+11 }
+</pre>
+</blockquote>
+
+<p>
+If this function ran concurrently with a series of updates that
+replaced the current structure with a new one,
+the fetches of <tt>gp->a</tt>
+and <tt>gp->b</tt> might well come from two different structures,
+which could cause serious confusion.
+To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
+<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
+
+<blockquote>
+<pre>
+ 1 bool do_something_gp(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 p = rcu_dereference(gp);
+ 5 if (p) {
+ 6 do_something(p->a, p->b);
+ 7 rcu_read_unlock();
+ 8 return true;
+ 9 }
+10 rcu_read_unlock();
+11 return false;
+12 }
+</pre>
+</blockquote>
+
+<p>
+The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
+memory barriers in the Linux kernel.
+Should a
+<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
+ever appear, then <tt>rcu_dereference()</tt> could be implemented
+as a <tt>memory_order_consume</tt> load.
+Regardless of the exact implementation, a pointer fetched by
+<tt>rcu_dereference()</tt> may not be used outside of the
+outermost RCU read-side critical section containing that
+<tt>rcu_dereference()</tt>, unless protection of
+the corresponding data element has been passed from RCU to some
+other synchronization mechanism, most commonly locking or
+<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
+
+<p>
+In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
+use <tt>rcu_dereference()</tt>, and these two RCU API elements
+work together to ensure that readers have a consistent view of
+newly added data elements.
+
+<p>
+Of course, it is also necessary to remove elements from RCU-protected
+data structures, for example, using the following process:
+
+<ol>
+<li> Remove the data element from the enclosing structure.
+<li> Wait for all pre-existing RCU read-side critical sections
+ to complete (because only pre-existing readers can possibly have
+ a reference to the newly removed data element).
+<li> At this point, only the updater has a reference to the
+ newly removed data element, so it can safely reclaim
+ the data element, for example, by passing it to <tt>kfree()</tt>.
+</ol>
+
+This process is implemented by <tt>remove_gp_synchronous()</tt>:
+
+<blockquote>
+<pre>
+ 1 bool remove_gp_synchronous(void)
+ 2 {
+ 3 struct foo *p;
+ 4
+ 5 spin_lock(&gp_lock);
+ 6 p = rcu_access_pointer(gp);
+ 7 if (!p) {
+ 8 spin_unlock(&gp_lock);
+ 9 return false;
+10 }
+11 rcu_assign_pointer(gp, NULL);
+12 spin_unlock(&gp_lock);
+13 synchronize_rcu();
+14 kfree(p);
+15 return true;
+16 }
+</pre>
+</blockquote>
+
+<p>
+This function is straightforward, with line 13 waiting for a grace
+period before line 14 frees the old data element.
+This waiting ensures that readers will reach line 7 of
+<tt>do_something_gp()</tt> before the data element referenced by
+<tt>p</tt> is freed.
+The <tt>rcu_access_pointer()</tt> on line 6 is similar to
+<tt>rcu_dereference()</tt>, except that:
+
+<ol>
+<li> The value returned by <tt>rcu_access_pointer()</tt>
+ cannot be dereferenced.
+ If you want to access the value pointed to as well as
+ the pointer itself, use <tt>rcu_dereference()</tt>
+ instead of <tt>rcu_access_pointer()</tt>.
+<li> The call to <tt>rcu_access_pointer()</tt> need not be
+ protected.
+ In contrast, <tt>rcu_dereference()</tt> must either be
+ within an RCU read-side critical section or in a code
+ segment where the pointer cannot change, for example, in
+ code protected by the corresponding update-side lock.
+</ol>
+
+<p>@@QQ@@
+Without the <tt>rcu_dereference()</tt> or the
+<tt>rcu_access_pointer()</tt>, what destructive optimizations
+might the compiler make use of?
+<p>@@QQA@@
+Let's start with what happens to <tt>do_something_gp()</tt>
+if it fails to use <tt>rcu_dereference()</tt>.
+It could reuse a value formerly fetched from this same pointer.
+It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
+manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
+mash-up of two distinct pointer values.
+It might even use value-speculation optimizations, where it makes a wrong
+guess, but by the time it gets around to checking the value, an update
+has changed the pointer to match the wrong guess.
+Too bad about any dereferences that returned pre-initialization garbage
+in the meantime!
+
+<p>
+For <tt>remove_gp_synchronous()</tt>, as long as all modifications
+to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
+the above optimizations are harmless.
+However,
+with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
+<tt>sparse</tt> will complain if you
+define <tt>gp</tt> with <tt>__rcu</tt> and then
+access it without using
+either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
+<p>@@QQE@@
+
+<p>
+In short, RCU's publish-subscribe guarantee is provided by the combination
+of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
+This guarantee allows data elements to be safely added to RCU-protected
+linked data structures without disrupting RCU readers.
+This guarantee can be used in combination with the grace-period
+guarantee to also allow data elements to be removed from RCU-protected
+linked data structures, again without disrupting RCU readers.
+
+<p>
+This guarantee was only partially premeditated.
+DYNIX/ptx used an explicit memory barrier for publication, but had nothing
+resembling <tt>rcu_dereference()</tt> for subscription, nor did it
+have anything resembling the <tt>smp_read_barrier_depends()</tt>
+that was later subsumed into <tt>rcu_dereference()</tt>.
+The need for these operations made itself known quite suddenly at a
+late-1990s meeting with the DEC Alpha architects, back in the days when
+DEC was still a free-standing company.
+It took the Alpha architects a good hour to convince me that any sort
+of barrier would ever be needed, and it then took me a good <i>two</i> hours
+to convince them that their documentation did not make this point clear.
+More recent work with the C and C++ standards committees has provided
+much education on tricks and traps from the compiler.
+In short, compilers were much less tricky in the early 1990s, but in
+2015, don't even think about omitting <tt>rcu_dereference()</tt>!
+
+<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
+
+<p>
+The previous section's simple linked-data-structure scenario clearly
+demonstrates the need for RCU's stringent memory-ordering guarantees on
+systems with more than one CPU:
+
+<ol>
+<li> Each CPU that has an RCU read-side critical section that
+ begins before <tt>synchronize_rcu()</tt> starts is
+ guaranteed to execute a full memory barrier between the time
+ that the RCU read-side critical section ends and the time that
+ <tt>synchronize_rcu()</tt> returns.
+ Without this guarantee, a pre-existing RCU read-side critical section
+ might hold a reference to the newly removed <tt>struct foo</tt>
+ after the <tt>kfree()</tt> on line 14 of
+ <tt>remove_gp_synchronous()</tt>.
+<li> Each CPU that has an RCU read-side critical section that ends
+ after <tt>synchronize_rcu()</tt> returns is guaranteed
+ to execute a full memory barrier between the time that
+ <tt>synchronize_rcu()</tt> begins and the time that the RCU
+ read-side critical section begins.
+ Without this guarantee, a later RCU read-side critical section
+ running after the <tt>kfree()</tt> on line 14 of
+ <tt>remove_gp_synchronous()</tt> might
+ later run <tt>do_something_gp()</tt> and find the
+ newly deleted <tt>struct foo</tt>.
+<li> If the task invoking <tt>synchronize_rcu()</tt> remains
+ on a given CPU, then that CPU is guaranteed to execute a full
+ memory barrier sometime during the execution of
+ <tt>synchronize_rcu()</tt>.
+ This guarantee ensures that the <tt>kfree()</tt> on
+ line 14 of <tt>remove_gp_synchronous()</tt> really does
+ execute after the removal on line 11.
+<li> If the task invoking <tt>synchronize_rcu()</tt> migrates
+ among a group of CPUs during that invocation, then each of the
+ CPUs in that group is guaranteed to execute a full memory barrier
+ sometime during the execution of <tt>synchronize_rcu()</tt>.
+ This guarantee also ensures that the <tt>kfree()</tt> on
+ line 14 of <tt>remove_gp_synchronous()</tt> really does
+ execute after the removal on
+ line 11, even in the case where the thread executing the
+ <tt>synchronize_rcu()</tt> migrates in the meantime.
+</ol>
+
+<p>@@QQ@@
+Given that multiple CPUs can start RCU read-side critical sections
+at any time without any ordering whatsoever, how can RCU possibly tell whether
+or not a given RCU read-side critical section starts before a
+given instance of <tt>synchronize_rcu()</tt>?
+<p>@@QQA@@
+If RCU cannot tell whether or not a given
+RCU read-side critical section starts before a
+given instance of <tt>synchronize_rcu()</tt>,
+then it must assume that the RCU read-side critical section
+started first.
+In other words, a given instance of <tt>synchronize_rcu()</tt>
+can avoid waiting on a given RCU read-side critical section only
+if it can prove that <tt>synchronize_rcu()</tt> started first.
+<p>@@QQE@@
+
+<p>@@QQ@@
+The first and second guarantees require unbelievably strict ordering!
+Are all these memory barriers <i>really</i> required?
+<p>@@QQA@@
+Yes, they really are required.
+To see why the first guarantee is required, consider the following
+sequence of events:
+
+<ol>
+<li> CPU 1: <tt>rcu_read_lock()</tt>
+<li> CPU 1: <tt>q = rcu_dereference(gp);
+ /* Very likely to return p. */</tt>
+<li> CPU 0: <tt>list_del_rcu(p);</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
+<li> CPU 1: <tt>do_something_with(q->a);
+ /* No smp_mb(), so might happen after kfree(). */</tt>
+<li> CPU 1: <tt>rcu_read_unlock()</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
+<li> CPU 0: <tt>kfree(p);</tt>
+</ol>
+
+<p>
+Therefore, there absolutely must be a full memory barrier between the
+end of the RCU read-side critical section and the end of the
+grace period.
+
+<p>
+The sequence of events demonstrating the necessity of the second rule
+is roughly similar:
+
+<ol>
+<li> CPU 0: <tt>list_del_rcu(p);</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> starts.
+<li> CPU 1: <tt>rcu_read_lock()</tt>
+<li> CPU 1: <tt>q = rcu_dereference(gp);
+ /* Might return p if no memory barrier. */</tt>
+<li> CPU 0: <tt>synchronize_rcu()</tt> returns.
+<li> CPU 0: <tt>kfree(p);</tt>
+<li> CPU 1: <tt>do_something_with(q->a); /* Boom!!! */</tt>
+<li> CPU 1: <tt>rcu_read_unlock()</tt>
+</ol>
+
+<p>
+And similarly, without a memory barrier between the beginning of the
+grace period and the beginning of the RCU read-side critical section,
+CPU 1 might end up accessing the freelist.
+
+<p>
+The “as if” rule of course applies, so that any implementation
+that acts as if the appropriate memory barriers were in place is a
+correct implementation.
+That said, it is much easier to fool yourself into believing that you have
+adhered to the as-if rule than it is to actually adhere to it!
+<p>@@QQE@@
+
+<p>
+Note that these memory-barrier requirements do not replace the fundamental
+RCU requirement that a grace period wait for all pre-existing readers.
+On the contrary, the memory barriers called out in this section must operate in
+such a way as to <i>enforce</i> this fundamental requirement.
+Of course, different implementations enforce this requirement in different
+ways, but enforce it they must.
+
+<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
+
+<p>
+The common-case RCU primitives are unconditional.
+They are invoked, they do their job, and they return, with no possibility
+of error, and no need to retry.
+This is a key RCU design philosophy.
+
+<p>
+However, this philosophy is pragmatic rather than pigheaded.
+If someone comes up with a good justification for a particular conditional
+RCU primitive, it might well be implemented and added.
+After all, this guarantee was reverse-engineered, not premeditated.
+The unconditional nature of the RCU primitives was initially an
+accident of implementation, and later experience with synchronization
+mechanisms that have conditional primitives caused me to elevate this
+accident to a guarantee.
+Therefore, the justification for adding a conditional primitive to
+RCU would need to be based on detailed and compelling use cases.
+
+<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
+
+<p>
+As far as RCU is concerned, it is always possible to carry out an
+update within an RCU read-side critical section.
+For example, that RCU read-side critical section might search for
+a given data element, and then might acquire the update-side
+spinlock in order to update that element, all while remaining
+in that RCU read-side critical section.
+Of course, it is necessary to exit the RCU read-side critical section
+before invoking <tt>synchronize_rcu()</tt>; however, this
+inconvenience can be avoided through use of the
+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
+described later in this document.
+
+<p>@@QQ@@
+But how does the upgrade-to-write operation exclude other readers?
+<p>@@QQA@@
+It doesn't, just like normal RCU updates, which also do not exclude
+RCU readers.
+<p>@@QQE@@
+
+<p>
+This guarantee allows lookup code to be shared between read-side
+and update-side code, and was premeditated, appearing in the earliest
+DYNIX/ptx RCU documentation.
+
+<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
+
+<p>
+RCU provides extremely lightweight readers, and its read-side guarantees,
+though quite useful, are correspondingly lightweight.
+It is therefore all too easy to assume that RCU is guaranteeing more
+than it really is.
+Of course, the list of things that RCU does not guarantee is infinitely
+long; however, the following sections list a few non-guarantees that
+have caused confusion.
+Except where otherwise noted, these non-guarantees were premeditated.
+
+<ol>
+<li> <a href="#Readers Impose Minimal Ordering">
+ Readers Impose Minimal Ordering</a>
+<li> <a href="#Readers Do Not Exclude Updaters">
+ Readers Do Not Exclude Updaters</a>
+<li> <a href="#Updaters Only Wait For Old Readers">
+ Updaters Only Wait For Old Readers</a>
+<li> <a href="#Grace Periods Don't Partition Read-Side Critical Sections">
+ Grace Periods Don't Partition Read-Side Critical Sections</a>
+<li> <a href="#Read-Side Critical Sections Don't Partition Grace Periods">
+ Read-Side Critical Sections Don't Partition Grace Periods</a>
+<li> <a href="#Disabling Preemption Does Not Block Grace Periods">
+ Disabling Preemption Does Not Block Grace Periods</a>
+</ol>
+
+<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
+
+<p>
+Reader-side markers such as <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
+except through their interaction with the grace-period APIs such as
+<tt>synchronize_rcu()</tt>.
+To see this, consider the following pair of threads:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(x, 1);
+ 5 rcu_read_unlock();
+ 6 rcu_read_lock();
+ 7 WRITE_ONCE(y, 1);
+ 8 rcu_read_unlock();
+ 9 }
+10
+11 void thread1(void)
+12 {
+13 rcu_read_lock();
+14 r1 = READ_ONCE(y);
+15 rcu_read_unlock();
+16 rcu_read_lock();
+17 r2 = READ_ONCE(x);
+18 rcu_read_unlock();
+19 }
+</pre>
+</blockquote>
+
+<p>
+After <tt>thread0()</tt> and <tt>thread1()</tt> execute
+concurrently, it is quite possible to have
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 0)
+</pre>
+</blockquote>
+
+(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
+which would not be possible if <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> had much in the way of ordering
+properties.
+But they do not, so the CPU is within its rights
+to do significant reordering.
+This is by design: Any significant ordering constraints would slow down
+these fast-path APIs.
+
+<p>@@QQ@@
+Can't the compiler also reorder this code?
+<p>@@QQA@@
+No, the volatile casts in <tt>READ_ONCE()</tt> and
+<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
+this particular case.
+<p>@@QQE@@
+
+<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
+
+<p>
+Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
+exclude updates.
+All they do is to prevent grace periods from ending.
+The following example illustrates this:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 r1 = READ_ONCE(y);
+ 5 if (r1) {
+ 6 do_something_with_nonzero_x();
+ 7 r2 = READ_ONCE(x);
+ 8 WARN_ON(!r2); /* BUG!!! */
+ 9 }
+10 rcu_read_unlock();
+11 }
+12
+13 void thread1(void)
+14 {
+15 spin_lock(&my_lock);
+16 WRITE_ONCE(x, 1);
+17 WRITE_ONCE(y, 1);
+18 spin_unlock(&my_lock);
+19 }
+</pre>
+</blockquote>
+
+<p>
+If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
+excluded the <tt>thread1()</tt> function's update,
+the <tt>WARN_ON()</tt> could never fire.
+But the fact is that <tt>rcu_read_lock()</tt> does not exclude
+much of anything aside from subsequent grace periods, of which
+<tt>thread1()</tt> has none, so the
+<tt>WARN_ON()</tt> can and does fire.
+
+<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
+
+<p>
+It might be tempting to assume that after <tt>synchronize_rcu()</tt>
+completes, there are no readers executing.
+This temptation must be avoided because
+new readers can start immediately after <tt>synchronize_rcu()</tt>
+starts, and <tt>synchronize_rcu()</tt> is under no
+obligation to wait for these new readers.
+
+<p>@@QQ@@
+Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
+Would the updater be able to rely on this?
+<p>@@QQA@@
+No.
+Even if <tt>synchronize_rcu()</tt> were to wait until
+all readers had completed, a new reader might start immediately after
+<tt>synchronize_rcu()</tt> completed.
+Therefore, the code following
+<tt>synchronize_rcu()</tt> cannot rely on there being no readers
+in any case.
+<p>@@QQE@@
+
+<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
+Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
+
+<p>
+It is tempting to assume that if any part of one RCU read-side critical
+section precedes a given grace period, and if any part of another RCU
+read-side critical section follows that same grace period, then all of
+the first RCU read-side critical section must precede all of the second.
+However, this just isn't the case: A single grace period does not
+partition the set of RCU read-side critical sections.
+An example of this situation can be illustrated as follows, where
+<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(a, 1);
+ 5 WRITE_ONCE(b, 1);
+ 6 rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11 r1 = READ_ONCE(a);
+12 synchronize_rcu();
+13 WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18 rcu_read_lock();
+19 r2 = READ_ONCE(b);
+20 r3 = READ_ONCE(c);
+21 rcu_read_unlock();
+22 }
+</pre>
+</blockquote>
+
+<p>
+It turns out that the outcome:
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 0 && r3 == 1)
+</pre>
+</blockquote>
+
+is entirely possible.
+The following figure shows how this can happen, with each circled
+<tt>QS</tt> indicating the point at which RCU recorded a
+<i>quiescent state</i> for each thread, that is, a state in which
+RCU knows that the thread cannot be in the midst of an RCU read-side
+critical section that started before the current grace period:
+
+<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
+
+<p>
+If it is necessary to partition RCU read-side critical sections in this
+manner, it is necessary to use two grace periods, where the first
+grace period is known to end before the second grace period starts:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(a, 1);
+ 5 WRITE_ONCE(b, 1);
+ 6 rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11 r1 = READ_ONCE(a);
+12 synchronize_rcu();
+13 WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18 r2 = READ_ONCE(c);
+19 synchronize_rcu();
+20 WRITE_ONCE(d, 1);
+21 }
+22
+23 void thread3(void)
+24 {
+25 rcu_read_lock();
+26 r3 = READ_ONCE(b);
+27 r4 = READ_ONCE(d);
+28 rcu_read_unlock();
+29 }
+</pre>
+</blockquote>
+
+<p>
+Here, if <tt>(r1 == 1)</tt>, then
+<tt>thread0()</tt>'s write to <tt>b</tt> must happen
+before the end of <tt>thread1()</tt>'s grace period.
+If in addition <tt>(r4 == 1)</tt>, then
+<tt>thread3()</tt>'s read from <tt>b</tt> must happen
+after the beginning of <tt>thread2()</tt>'s grace period.
+If it is also the case that <tt>(r2 == 1)</tt>, then the
+end of <tt>thread1()</tt>'s grace period must precede the
+beginning of <tt>thread2()</tt>'s grace period.
+This means that the two RCU read-side critical sections cannot overlap,
+guaranteeing that <tt>(r3 == 1)</tt>.
+As a result, the outcome:
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 1 && r3 == 0 && r4 == 1)
+</pre>
+</blockquote>
+
+cannot happen.
+
+<p>
+This non-requirement was also non-premeditated, but became apparent
+when studying RCU's interaction with memory ordering.
+
+<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
+Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
+
+<p>
+It is also tempting to assume that if an RCU read-side critical section
+happens between a pair of grace periods, then those grace periods cannot
+overlap.
+However, this temptation leads nowhere good, as can be illustrated by
+the following, with all variables initially zero:
+
+<blockquote>
+<pre>
+ 1 void thread0(void)
+ 2 {
+ 3 rcu_read_lock();
+ 4 WRITE_ONCE(a, 1);
+ 5 WRITE_ONCE(b, 1);
+ 6 rcu_read_unlock();
+ 7 }
+ 8
+ 9 void thread1(void)
+10 {
+11 r1 = READ_ONCE(a);
+12 synchronize_rcu();
+13 WRITE_ONCE(c, 1);
+14 }
+15
+16 void thread2(void)
+17 {
+18 rcu_read_lock();
+19 WRITE_ONCE(d, 1);
+20 r2 = READ_ONCE(c);
+21 rcu_read_unlock();
+22 }
+23
+24 void thread3(void)
+25 {
+26 r3 = READ_ONCE(d);
+27 synchronize_rcu();
+28 WRITE_ONCE(e, 1);
+29 }
+30
+31 void thread4(void)
+32 {
+33 rcu_read_lock();
+34 r4 = READ_ONCE(b);
+35 r5 = READ_ONCE(e);
+36 rcu_read_unlock();
+37 }
+</pre>
+</blockquote>
+
+<p>
+In this case, the outcome:
+
+<blockquote>
+<pre>
+(r1 == 1 && r2 == 1 && r3 == 1 && r4 == 0 && r5 == 1)
+</pre>
+</blockquote>
+
+is entirely possible, as illustrated below:
+
+<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
+
+<p>
+Again, an RCU read-side critical section can overlap almost all of a
+given grace period, just so long as it does not overlap the entire
+grace period.
+As a result, an RCU read-side critical section cannot partition a pair
+of RCU grace periods.
+
+<p>@@QQ@@
+How long a sequence of grace periods, each separated by an RCU read-side
+critical section, would be required to partition the RCU read-side
+critical sections at the beginning and end of the chain?
+<p>@@QQA@@
+In theory, an infinite number.
+In practice, an unknown number that is sensitive to both implementation
+details and timing considerations.
+Therefore, even in practice, RCU users must abide by the theoretical rather
+than the practical answer.
+<p>@@QQE@@
+
+<h3><a name="Disabling Preemption Does Not Block Grace Periods">
+Disabling Preemption Does Not Block Grace Periods</a></h3>
+
+<p>
+There was a time when disabling preemption on any given CPU would block
+subsequent grace periods.
+However, this was an accident of implementation and is not a requirement.
+And in the current Linux-kernel implementation, disabling preemption
+on a given CPU in fact does not block grace periods, as Oleg Nesterov
+<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
+
+<p>
+If you need a preempt-disable region to block grace periods, you need to add
+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
+as follows:
+
+<blockquote>
+<pre>
+ 1 preempt_disable();
+ 2 rcu_read_lock();
+ 3 do_something();
+ 4 rcu_read_unlock();
+ 5 preempt_enable();
+ 6
+ 7 /* Spinlocks implicitly disable preemption. */
+ 8 spin_lock(&mylock);
+ 9 rcu_read_lock();
+10 do_something();
+11 rcu_read_unlock();
+12 spin_unlock(&mylock);
+</pre>
+</blockquote>
+
+<p>
+In theory, you could enter the RCU read-side critical section first,
+but it is more efficient to keep the entire RCU read-side critical
+section contained in the preempt-disable region as shown above.
+Of course, RCU read-side critical sections that extend outside of
+preempt-disable regions will work correctly, but such critical sections
+can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
+more work.
+And no, this is <i>not</i> an invitation to enclose all of your RCU
+read-side critical sections within preempt-disable regions, because
+doing so would degrade real-time response.
+
+<p>
+This non-requirement appeared with preemptible RCU.
+If you need a grace period that waits on non-preemptible code regions, use
+<a href="#Sched Flavor">RCU-sched</a>.
+
+<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
+
+<p>
+These parallelism facts of life are by no means specific to RCU, but
+the RCU implementation must abide by them.
+They therefore bear repeating:
+
+<ol>
+<li> Any CPU or task may be delayed at any time,
+ and any attempts to avoid these delays by disabling
+ preemption, interrupts, or whatever are completely futile.
+ This is most obvious in preemptible user-level
+ environments and in virtualized environments (where
+ a given guest OS's VCPUs can be preempted at any time by
+ the underlying hypervisor), but can also happen in bare-metal
+ environments due to ECC errors, NMIs, and other hardware
+ events.
+ Although a delay of more than about 20 seconds can result
+ in splats, the RCU implementation is obligated to use
+ algorithms that can tolerate extremely long delays, but where
+ “extremely long” is not long enough to allow
+ wrap-around when incrementing a 64-bit counter.
+<li> Both the compiler and the CPU can reorder memory accesses.
+ Where it matters, RCU must use compiler directives and
+ memory-barrier instructions to preserve ordering.
+<li> Conflicting writes to memory locations in any given cache line
+ will result in expensive cache misses.
+ Greater numbers of concurrent writes and more-frequent
+ concurrent writes will result in more dramatic slowdowns.
+ RCU is therefore obligated to use algorithms that have
+ sufficient locality to avoid significant performance and
+ scalability problems.
+<li> As a rough rule of thumb, only one CPU's worth of processing
+ may be carried out under the protection of any given exclusive
+ lock.
+ RCU must therefore use scalable locking designs.
+<li> Counters are finite, especially on 32-bit systems.
+ RCU's use of counters must therefore tolerate counter wrap,
+ or be designed such that counter wrap would take way more
+ time than a single system is likely to run.
+ An uptime of ten years is quite possible, a runtime
+ of a century much less so.
+ As an example of the latter, RCU's dyntick-idle nesting counter
+ allows 54 bits for interrupt nesting level (this counter
+ is 64 bits even on a 32-bit system).
+ Overflowing this counter requires 2<sup>54</sup>
+ half-interrupts on a given CPU without that CPU ever going idle.
+ If a half-interrupt happened every microsecond, it would take
+ 570 years of runtime to overflow this counter, which is currently
+ believed to be an acceptably long time.
+<li> Linux systems can have thousands of CPUs running a single
+ Linux kernel in a single shared-memory environment.
+ RCU must therefore pay close attention to high-end scalability.
+</ol>
+
+<p>
+This last parallelism fact of life means that RCU must pay special
+attention to the preceding facts of life.
+The idea that Linux might scale to systems with thousands of CPUs would
+have been met with some skepticism in the 1990s, but these requirements
+would otherwise have been unsurprising, even in the early 1990s.
+
+<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
+
+<p>
+These sections list quality-of-implementation requirements.
+Although an RCU implementation that ignores these requirements could
+still be used, it would likely be subject to limitations that would
+make it inappropriate for industrial-strength production use.
+Classes of quality-of-implementation requirements are as follows:
+
+<ol>
+<li> <a href="#Specialization">Specialization</a>
+<li> <a href="#Performance and Scalability">Performance and Scalability</a>
+<li> <a href="#Composability">Composability</a>
+<li> <a href="#Corner Cases">Corner Cases</a>
+</ol>
+
+<p>
+These classes are covered in the following sections.
+
+<h3><a name="Specialization">Specialization</a></h3>
+
+<p>
+RCU is and always has been intended primarily for read-mostly situations, as
+illustrated by the following figure.
+This means that RCU's read-side primitives are optimized, often at the
+expense of its update-side primitives.
+
+<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
+
+<p>
+This focus on read-mostly situations means that RCU must interoperate
+with other synchronization primitives.
+For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
+examples discussed earlier use RCU to protect readers and locking to
+coordinate updaters.
+However, the need extends much farther, requiring that a variety of
+synchronization primitives be legal within RCU read-side critical sections,
+including spinlocks, sequence locks, atomic operations, reference
+counters, and memory barriers.
+
+<p>@@QQ@@
+What about sleeping locks?
+<p>@@QQA@@
+These are forbidden within Linux-kernel RCU read-side critical sections
+because it is not legal to place a quiescent state (in this case,
+voluntary context switch) within an RCU read-side critical section.
+However, sleeping locks may be used within userspace RCU read-side critical
+sections, and also within Linux-kernel sleepable RCU
+<a href="#Sleepable RCU">(SRCU)</a>
+read-side critical sections.
+In addition, the -rt patchset turns spinlocks into sleeping locks so
+that the corresponding critical sections can be preempted, which
+also means that these sleeplockified spinlocks (but not other sleeping locks!)
+may be acquired within -rt-Linux-kernel RCU read-side critical sections.
+
+<p>
+Note that it <i>is</i> legal for a normal RCU read-side critical section
+to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
+but only as long as it does not loop indefinitely attempting to
+conditionally acquire that sleeping lock.
+The key point is that things like <tt>mutex_trylock()</tt>
+either return with the mutex held, or return an error indication if
+the mutex was not immediately available.
+Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
+<p>@@QQE@@
+
+<p>
+It often comes as a surprise that many algorithms do not require a
+consistent view of data, but many can function in that mode,
+with network routing being the poster child.
+Internet routing algorithms take significant time to propagate
+updates, so that by the time an update arrives at a given system,
+that system has been sending network traffic the wrong way for
+a considerable length of time.
+Having a few threads continue to send traffic the wrong way for a
+few more milliseconds is clearly not a problem: In the worst case,
+TCP retransmissions will eventually get the data where it needs to go.
+In general, when tracking the state of the universe outside of the
+computer, some level of inconsistency must be tolerated due to
+speed-of-light delays if nothing else.
+
+<p>
+Furthermore, uncertainty about external state is inherent in many cases.
+For example, a pair of veterinarians might use heartbeat to determine
+whether or not a given cat was alive.
+But how long should they wait after the last heartbeat to decide that
+the cat is in fact dead?
+Waiting less than 400 milliseconds makes no sense because this would
+mean that a relaxed cat would be considered to cycle between death
+and life more than 100 times per minute.
+Moreover, just as with human beings, a cat's heart might stop for
+some period of time, so the exact wait period is a judgment call.
+One of our pair of veterinarians might wait 30 seconds before pronouncing
+the cat dead, while the other might insist on waiting a full minute.
+The two veterinarians would then disagree on the state of the cat during
+the final 30 seconds of the minute following the last heartbeat, as
+fancifully illustrated below:
+
+<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
+
+<p>
+Interestingly enough, this same situation applies to hardware.
+When push comes to shove, how do we tell whether or not some
+external server has failed?
+We send messages to it periodically, and declare it failed if we
+don't receive a response within a given period of time.
+Policy decisions can usually tolerate short
+periods of inconsistency.
+The policy was decided some time ago, and is only now being put into
+effect, so a few milliseconds of delay is normally inconsequential.
+
+<p>
+However, there are algorithms that absolutely must see consistent data.
+For example, the translation from a user-level SystemV semaphore
+ID to the corresponding in-kernel data structure is protected by RCU,
+but it is absolutely forbidden to update a semaphore that has just been
+removed.
+In the Linux kernel, this need for consistency is accommodated by acquiring
+spinlocks located in the in-kernel data structure from within
+the RCU read-side critical section, and this is indicated by the
+green box in the figure above.
+Many other techniques may be used, and are in fact used within the
+Linux kernel.
+
+<p>
+In short, RCU is not required to maintain consistency, and other
+mechanisms may be used in concert with RCU when consistency is required.
+RCU's specialization allows it to do its job extremely well, and its
+ability to interoperate with other synchronization mechanisms allows
+the right mix of synchronization tools to be used for a given job.
+
+<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
+
+<p>
+Energy efficiency is a critical component of performance today,
+and Linux-kernel RCU implementations must therefore avoid unnecessarily
+awakening idle CPUs.
+I cannot claim that this requirement was premeditated.
+In fact, I learned of it during a telephone conversation in which I
+was given “frank and open” feedback on the importance
+of energy efficiency in battery-powered systems and on specific
+energy-efficiency shortcomings of the Linux-kernel RCU implementation.
+In my experience, the battery-powered embedded community will consider
+any unnecessary wakeups to be extremely unfriendly acts.
+So much so that mere Linux-kernel-mailing-list posts are
+insufficient to vent their ire.
+
+<p>
+Memory consumption is not particularly important in most
+situations, and has become decreasingly
+so as memory sizes have expanded and memory
+costs have plummeted.
+However, as I learned from Matt Mackall's
+<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
+efforts, memory footprint is critically important on single-CPU systems with
+non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
+<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
+was born.
+Josh Triplett has since taken over the small-memory banner with his
+<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
+project, which resulted in
+<a href="#Sleepable RCU">SRCU</a>
+becoming optional for those kernels not needing it.
+
+<p>
+The remaining performance requirements are, for the most part,
+unsurprising.
+For example, in keeping with RCU's read-side specialization,
+<tt>rcu_dereference()</tt> should have negligible overhead (for
+example, suppression of a few minor compiler optimizations).
+Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
+
+<p>
+In preemptible environments, in the case where the RCU read-side
+critical section was not preempted (as will be the case for the
+highest-priority real-time process), <tt>rcu_read_lock()</tt> and
+<tt>rcu_read_unlock()</tt> should have minimal overhead.
+In particular, they should not contain atomic read-modify-write
+operations, memory-barrier instructions, preemption disabling,
+interrupt disabling, or backwards branches.
+However, in the case where the RCU read-side critical section was preempted,
+<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
+This is why it is better to nest an RCU read-side critical section
+within a preempt-disable region than vice versa, at least in cases
+where that critical section is short enough to avoid unduly degrading
+real-time latencies.
+
+<p>
+The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
+optimized for throughput.
+It may therefore incur several milliseconds of latency in addition to
+the duration of the longest RCU read-side critical section.
+On the other hand, multiple concurrent invocations of
+<tt>synchronize_rcu()</tt> are required to use batching optimizations
+so that they can be satisfied by a single underlying grace-period-wait
+operation.
+For example, in the Linux kernel, it is not unusual for a single
+grace-period-wait operation to serve more than
+<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
+of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
+overhead down to nearly zero.
+However, the grace-period optimization is also required to avoid
+measurable degradation of real-time scheduling and interrupt latencies.
+
+<p>
+In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
+latencies are unacceptable.
+In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
+instead, reducing the grace-period latency down to a few tens of
+microseconds on small systems, at least in cases where the RCU read-side
+critical sections are short.
+There are currently no special latency requirements for
+<tt>synchronize_rcu_expedited()</tt> on large systems, but,
+consistent with the empirical nature of the RCU specification,
+that is subject to change.
+However, there most definitely are scalability requirements:
+A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
+CPUs should at least make reasonable forward progress.
+In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
+is permitted to impose modest degradation of real-time latency
+on non-idle online CPUs.
+That said, it will likely be necessary to take further steps to reduce this
+degradation, hopefully to roughly that of a scheduling-clock interrupt.
+
+<p>
+There are a number of situations where even
+<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
+latency is unacceptable.
+In these situations, the asynchronous <tt>call_rcu()</tt> can be
+used in place of <tt>synchronize_rcu()</tt> as follows:
+
+<blockquote>
+<pre>
+ 1 struct foo {
+ 2 int a;
+ 3 int b;
+ 4 struct rcu_head rh;
+ 5 };
+ 6
+ 7 static void remove_gp_cb(struct rcu_head *rhp)
+ 8 {
+ 9 struct foo *p = container_of(rhp, struct foo, rh);
+10
+11 kfree(p);
+12 }
+13
+14 bool remove_gp_asynchronous(void)
+15 {
+16 struct foo *p;
+17
+18 spin_lock(&gp_lock);
+19 p = rcu_access_pointer(gp);
+20 if (!p) {
+21 spin_unlock(&gp_lock);
+22 return false;
+23 }
+24 rcu_assign_pointer(gp, NULL);
+25 call_rcu(&p->rh, remove_gp_cb);
+26 spin_unlock(&gp_lock);
+27 return true;
+28 }
+</pre>
+</blockquote>
+
+<p>
+A definition of <tt>struct foo</tt> is finally needed, and appears
+on lines 1-5.
+The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
+on line 25, and will be invoked after the end of a subsequent
+grace period.
+This gets the same effect as <tt>remove_gp_synchronous()</tt>,
+but without forcing the updater to wait for a grace period to elapse.
+The <tt>call_rcu()</tt> function may be used in a number of
+situations where neither <tt>synchronize_rcu()</tt> nor
+<tt>synchronize_rcu_expedited()</tt> would be legal,
+including within preempt-disable code, <tt>local_bh_disable()</tt> code,
+interrupt-disable code, and interrupt handlers.
+However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
+The callback function (<tt>remove_gp_cb()</tt> in this case) will be
+executed within a softirq (software interrupt) environment within the
+Linux kernel,
+either within a real softirq handler or under the protection
+of <tt>local_bh_disable()</tt>.
+In both the Linux kernel and in userspace, it is bad practice to
+write an RCU callback function that takes too long.
+Long-running operations should be relegated to separate threads or
+(in the Linux kernel) workqueues.
+
+<p>@@QQ@@
+Why does line 19 use <tt>rcu_access_pointer()</tt>?
+After all, <tt>call_rcu()</tt> on line 25 stores into the
+structure, which would interact badly with concurrent insertions.
+Doesn't this mean that <tt>rcu_dereference()</tt> is required?
+<p>@@QQA@@
+Presumably the <tt>->gp_lock</tt> acquired on line 18 excludes
+any changes, including any insertions that <tt>rcu_dereference()</tt>
+would protect against.
+Therefore, any insertions will be delayed until after <tt>->gp_lock</tt>
+is released on line 26, which in turn means that
+<tt>rcu_access_pointer()</tt> suffices.
+<p>@@QQE@@
+
+<p>
+However, all that <tt>remove_gp_cb()</tt> is doing is
+invoking <tt>kfree()</tt> on the data element.
+This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
+which allows “fire and forget” operation as shown below:
+
+<blockquote>
+<pre>
+ 1 struct foo {
+ 2 int a;
+ 3 int b;
+ 4 struct rcu_head rh;
+ 5 };
+ 6
+ 7 bool remove_gp_faf(void)
+ 8 {
+ 9 struct foo *p;
+10
+11 spin_lock(&gp_lock);
+12 p = rcu_dereference(gp);
+13 if (!p) {
+14 spin_unlock(&gp_lock);
+15 return false;
+16 }
+17 rcu_assign_pointer(gp, NULL);
+18 kfree_rcu(p, rh);
+19 spin_unlock(&gp_lock);
+20 return true;
+21 }
+</pre>
+</blockquote>
+
+<p>
+Note that <tt>remove_gp_faf()</tt> simply invokes
+<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
+further attention to the subsequent grace period and <tt>kfree()</tt>.
+It is permissible to invoke <tt>kfree_rcu()</tt> from the same
+environments as for <tt>call_rcu()</tt>.
+Interestingly enough, DYNIX/ptx had the equivalents of
+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
+<tt>synchronize_rcu()</tt>.
+This was due to the fact that RCU was not heavily used within DYNIX/ptx,
+so the very few places that needed something like
+<tt>synchronize_rcu()</tt> simply open-coded it.
+
+<p>@@QQ@@
+Earlier it was claimed that <tt>call_rcu()</tt> and
+<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
+by readers.
+But how can that be correct, given that the invocation of the callback
+and the freeing of the memory (respectively) must still wait for
+a grace period to elapse?
+<p>@@QQA@@
+We could define things this way, but keep in mind that this sort of
+definition would say that updates in garbage-collected languages
+cannot complete until the next time the garbage collector runs,
+which does not seem at all reasonable.
+The key point is that in most cases, an updater using either
+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
+next update as soon as it has invoked <tt>call_rcu()</tt> or
+<tt>kfree_rcu()</tt>, without having to wait for a subsequent
+grace period.
+<p>@@QQE@@
+
+<p>
+But what if the updater must wait for the completion of code to be
+executed after the end of the grace period, but has other tasks
+that can be carried out in the meantime?
+The polling-style <tt>get_state_synchronize_rcu()</tt> and
+<tt>cond_synchronize_rcu()</tt> functions may be used for this
+purpose, as shown below:
+
+<blockquote>
+<pre>
+ 1 bool remove_gp_poll(void)
+ 2 {
+ 3 struct foo *p;
+ 4 unsigned long s;
+ 5
+ 6 spin_lock(&gp_lock);
+ 7 p = rcu_access_pointer(gp);
+ 8 if (!p) {
+ 9 spin_unlock(&gp_lock);
+10 return false;
+11 }
+12 rcu_assign_pointer(gp, NULL);
+13 spin_unlock(&gp_lock);
+14 s = get_state_synchronize_rcu();
+15 do_something_while_waiting();
+16 cond_synchronize_rcu(s);
+17 kfree(p);
+18 return true;
+19 }
+</pre>
+</blockquote>
+
+<p>
+On line 14, <tt>get_state_synchronize_rcu()</tt> obtains a
+“cookie” from RCU,
+then line 15 carries out other tasks,
+and finally, line 16 returns immediately if a grace period has
+elapsed in the meantime, but otherwise waits as required.
+The need for <tt>get_state_synchronize_rcu()</tt> and
+<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
+so it is too early to tell whether they will stand the test of time.
+
+<p>
+RCU thus provides a range of tools to allow updaters to strike the
+required tradeoff between latency, flexibility and CPU overhead.
+
+<h3><a name="Composability">Composability</a></h3>
+
+<p>
+Composability has received much attention in recent years, perhaps in part
+due to the collision of multicore hardware with object-oriented techniques
+designed in single-threaded environments for single-threaded use.
+And in theory, RCU read-side critical sections may be composed, and in
+fact may be nested arbitrarily deeply.
+In practice, as with all real-world implementations of composable
+constructs, there are limitations.
+
+<p>
+Implementations of RCU for which <tt>rcu_read_lock()</tt>
+and <tt>rcu_read_unlock()</tt> generate no code, such as
+Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
+nested arbitrarily deeply.
+After all, there is no overhead.
+Except that if all these instances of <tt>rcu_read_lock()</tt>
+and <tt>rcu_read_unlock()</tt> are visible to the compiler,
+compilation will eventually fail due to exhausting memory,
+mass storage, or user patience, whichever comes first.
+If the nesting is not visible to the compiler, as is the case with
+mutually recursive functions each in its own translation unit,
+stack overflow will result.
+If the nesting takes the form of loops, either the control variable
+will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
+Nevertheless, this class of RCU implementations is one
+of the most composable constructs in existence.
+
+<p>
+RCU implementations that explicitly track nesting depth
+are limited by the nesting-depth counter.
+For example, the Linux kernel's preemptible RCU limits nesting to
+<tt>INT_MAX</tt>.
+This should suffice for almost all practical purposes.
+That said, a consecutive pair of RCU read-side critical sections
+between which there is an operation that waits for a grace period
+cannot be enclosed in another RCU read-side critical section.
+This is because it is not legal to wait for a grace period within
+an RCU read-side critical section: To do so would result either
+in deadlock or
+in RCU implicitly splitting the enclosing RCU read-side critical
+section, neither of which is conducive to a long-lived and prosperous
+kernel.
+
+<p>
+It is worth noting that RCU is not alone in limiting composability.
+For example, many transactional-memory implementations prohibit
+composing a pair of transactions separated by an irrevocable
+operation (for example, a network receive operation).
+For another example, lock-based critical sections can be composed
+surprisingly freely, but only if deadlock is avoided.
+
+<p>
+In short, although RCU read-side critical sections are highly composable,
+care is required in some situations, just as is the case for any other
+composable synchronization mechanism.
+
+<h3><a name="Corner Cases">Corner Cases</a></h3>
+
+<p>
+A given RCU workload might have an endless and intense stream of
+RCU read-side critical sections, perhaps even so intense that there
+was never a point in time during which there was not at least one
+RCU read-side critical section in flight.
+RCU cannot allow this situation to block grace periods: As long as
+all the RCU read-side critical sections are finite, grace periods
+must also be finite.
+
+<p>
+That said, preemptible RCU implementations could potentially result
+in RCU read-side critical sections being preempted for long durations,
+which has the effect of creating a long-duration RCU read-side
+critical section.
+This situation can arise only in heavily loaded systems, but systems using
+real-time priorities are of course more vulnerable.
+Therefore, RCU priority boosting is provided to help deal with this
+case.
+That said, the exact requirements on RCU priority boosting will likely
+evolve as more experience accumulates.
+
+<p>
+Other workloads might have very high update rates.
+Although one can argue that such workloads should instead use
+something other than RCU, the fact remains that RCU must
+handle such workloads gracefully.
+This requirement is another factor driving batching of grace periods,
+but it is also the driving force behind the checks for large numbers
+of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
+Finally, high update rates should not delay RCU read-side critical
+sections, although some read-side delays can occur when using
+<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
+of <tt>try_stop_cpus()</tt>.
+(In the future, <tt>synchronize_rcu_expedited()</tt> will be
+converted to use lighter-weight inter-processor interrupts (IPIs),
+but this will still disturb readers, though to a much smaller degree.)
+
+<p>
+Although all three of these corner cases were understood in the early
+1990s, a simple user-level test consisting of <tt>close(open(path))</tt>
+in a tight loop
+in the early 2000s suddenly provided a much deeper appreciation of the
+high-update-rate corner case.
+This test also motivated addition of some RCU code to react to high update
+rates, for example, if a given CPU finds itself with more than 10,000
+RCU callbacks queued, it will cause RCU to take evasive action by
+more aggressively starting grace periods and more aggressively forcing
+completion of grace-period processing.
+This evasive action causes the grace period to complete more quickly,
+but at the cost of restricting RCU's batching optimizations, thus
+increasing the CPU overhead incurred by that grace period.
+
+<h2><a name="Software-Engineering Requirements">
+Software-Engineering Requirements</a></h2>
+
+<p>
+Between Murphy's Law and “To err is human”, it is necessary to
+guard against mishaps and misuse:
+
+<ol>
+<li> It is all too easy to forget to use <tt>rcu_read_lock()</tt>
+ everywhere that it is needed, so kernels built with
+ <tt>CONFIG_PROVE_RCU=y</tt> will splat if
+ <tt>rcu_dereference()</tt> is used outside of an
+ RCU read-side critical section.
+ Update-side code can use <tt>rcu_dereference_protected()</tt>,
+ which takes a
+ <a href="https://lwn.net/Articles/371986/">lockdep expression</a>
+ to indicate what is providing the protection.
+ If the indicated protection is not provided, a lockdep splat
+ is emitted.
+
+ <p>
+ Code shared between readers and updaters can use
+ <tt>rcu_dereference_check()</tt>, which also takes a
+ lockdep expression, and emits a lockdep splat if neither
+ <tt>rcu_read_lock()</tt> nor the indicated protection
+ is in place.
+ In addition, <tt>rcu_dereference_raw()</tt> is used in those
+ (hopefully rare) cases where the required protection cannot
+ be easily described.
+ Finally, <tt>rcu_read_lock_held()</tt> is provided to
+ allow a function to verify that it has been invoked within
+ an RCU read-side critical section.
+ I was made aware of this set of requirements shortly after Thomas
+ Gleixner audited a number of RCU uses.
+<li> A given function might wish to check for RCU-related preconditions
+ upon entry, before using any other RCU API.
+ The <tt>rcu_lockdep_assert()</tt> does this job,
+ asserting the expression in kernels having lockdep enabled
+ and doing nothing otherwise.
+<li> It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
+ and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
+ substituting a simple assignment.
+ To catch this sort of error, a given RCU-protected pointer may be
+ tagged with <tt>__rcu</tt>, after which running sparse
+ with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
+ about simple-assignment accesses to that pointer.
+ Arnd Bergmann made me aware of this requirement, and also
+ supplied the needed
+ <a href="https://lwn.net/Articles/376011/">patch series</a>.
+<li> Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
+ will splat if a data element is passed to <tt>call_rcu()</tt>
+ twice in a row, without a grace period in between.
+ (This error is similar to a double free.)
+ The corresponding <tt>rcu_head</tt> structures that are
+ dynamically allocated are automatically tracked, but
+ <tt>rcu_head</tt> structures allocated on the stack
+ must be initialized with <tt>init_rcu_head_on_stack()</tt>
+ and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
+ Similarly, statically allocated non-stack <tt>rcu_head</tt>
+ structures must be initialized with <tt>init_rcu_head()</tt>
+ and cleaned up with <tt>destroy_rcu_head()</tt>.
+ Mathieu Desnoyers made me aware of this requirement, and also
+ supplied the needed
+ <a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
+<li> An infinite loop in an RCU read-side critical section will
+ eventually trigger an RCU CPU stall warning splat, with
+ the duration of “eventually” being controlled by the
+ <tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
+ alternatively, by the
+ <tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
+ parameter.
+ However, RCU is not obligated to produce this splat
+ unless there is a grace period waiting on that particular
+ RCU read-side critical section.
+ <p>
+ Some extreme workloads might intentionally delay
+ RCU grace periods, and systems running those workloads can
+ be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
+ to suppress the splats.
+ This kernel parameter may also be set via <tt>sysfs</tt>.
+ Furthermore, RCU CPU stall warnings are counter-productive
+ during sysrq dumps and during panics.
+ RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
+ <tt>rcu_sysrq_end()</tt> API members to be called before
+ and after long sysrq dumps.
+ RCU also supplies the <tt>rcu_panic()</tt> notifier that is
+ automatically invoked at the beginning of a panic to suppress
+ further RCU CPU stall warnings.
+
+ <p>
+ This requirement made itself known in the early 1990s, pretty
+ much the first time that it was necessary to debug a CPU stall.
+ That said, the initial implementation in DYNIX/ptx was quite
+ generic in comparison with that of Linux.
+<li> Although it would be very good to detect pointers leaking out
+ of RCU read-side critical sections, there is currently no
+ good way of doing this.
+ One complication is the need to distinguish between pointers
+ leaking and pointers that have been handed off from RCU to
+ some other synchronization mechanism, for example, reference
+ counting.
+<li> In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
+ information is provided via both debugfs and event tracing.
+<li> Open-coded use of <tt>rcu_assign_pointer()</tt> and
+ <tt>rcu_dereference()</tt> to create typical linked
+ data structures can be surprisingly error-prone.
+ Therefore, RCU-protected
+ <a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
+ and, more recently, RCU-protected
+ <a href="https://lwn.net/Articles/612100/">hash tables</a>
+ are available.
+ Many other special-purpose RCU-protected data structures are
+ available in the Linux kernel and the userspace RCU library.
+<li> Some linked structures are created at compile time, but still
+ require <tt>__rcu</tt> checking.
+ The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
+ purpose.
+<li> It is not necessary to use <tt>rcu_assign_pointer()</tt>
+ when creating linked structures that are to be published via
+ a single external pointer.
+ The <tt>RCU_INIT_POINTER()</tt> macro is provided for
+ this task and also for assigning <tt>NULL</tt> pointers
+ at runtime.
+</ol>
+
+<p>
+This is not a hard-and-fast list: RCU's diagnostic capabilities will
+continue to be guided by the number and type of usage bugs found
+in real-world RCU usage.
+
+<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
+
+<p>
+The Linux kernel provides an interesting environment for all kinds of
+software, including RCU.
+Some of the relevant points of interest are as follows:
+
+<ol>
+<li> <a href="#Configuration">Configuration</a>.
+<li> <a href="#Firmware Interface">Firmware Interface</a>.
+<li> <a href="#Early Boot">Early Boot</a>.
+<li> <a href="#Interrupts and NMIs">
+ Interrupts and non-maskable interrupts (NMIs)</a>.
+<li> <a href="#Loadable Modules">Loadable Modules</a>.
+<li> <a href="#Hotplug CPU">Hotplug CPU</a>.
+<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
+<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
+<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
+<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
+<li> <a href="#Performance, Scalability, Response Time, and Reliability">
+ Performance, Scalability, Response Time, and Reliability</a>.
+</ol>
+
+<p>
+This list is probably incomplete, but it does give a feel for the
+most notable Linux-kernel complications.
+Each of the following sections covers one of the above topics.
+
+<h3><a name="Configuration">Configuration</a></h3>
+
+<p>
+RCU's goal is automatic configuration, so that almost nobody
+needs to worry about RCU's <tt>Kconfig</tt> options.
+And for almost all users, RCU does in fact work well
+“out of the box.”
+
+<p>
+However, there are specialized use cases that are handled by
+kernel boot parameters and <tt>Kconfig</tt> options.
+Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
+about new <tt>Kconfig</tt> options, which requires almost all of them
+be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
+
+<p>
+This all should be quite obvious, but the fact remains that
+Linus Torvalds recently had to
+<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
+me of this requirement.
+
+<h3><a name="Firmware Interface">Firmware Interface</a></h3>
+
+<p>
+In many cases, the kernel obtains information about the system from the
+firmware, and sometimes things are lost in translation.
+Or the translation is accurate, but the original message is bogus.
+
+<p>
+For example, some systems' firmware overreports the number of CPUs,
+sometimes by a large factor.
+If RCU naively believed the firmware, as it used to do,
+it would create too many per-CPU kthreads.
+Although the resulting system will still run correctly, the extra
+kthreads needlessly consume memory and can cause confusion
+when they show up in <tt>ps</tt> listings.
+
+<p>
+RCU must therefore wait for a given CPU to actually come online before
+it can allow itself to believe that the CPU actually exists.
+The resulting “ghost CPUs” (which are never going to
+come online) cause a number of
+<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
+
+<h3><a name="Early Boot">Early Boot</a></h3>
+
+<p>
+The Linux kernel's boot sequence is an interesting process,
+and RCU is used early, even before <tt>rcu_init()</tt>
+is invoked.
+In fact, a number of RCU's primitives can be used as soon as the
+initial task's <tt>task_struct</tt> is available and the
+boot CPU's per-CPU variables are set up.
+The read-side primitives (<tt>rcu_read_lock()</tt>,
+<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
+and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
+as will <tt>rcu_assign_pointer()</tt>.
+
+<p>
+Although <tt>call_rcu()</tt> may be invoked at any
+time during boot, callbacks are not guaranteed to be invoked until after
+the scheduler is fully up and running.
+This delay in callback invocation is due to the fact that RCU does not
+invoke callbacks until it is fully initialized, and this full initialization
+cannot occur until after the scheduler has initialized itself to the
+point where RCU can spawn and run its kthreads.
+In theory, it would be possible to invoke callbacks earlier,
+however, this is not a panacea because there would be severe restrictions
+on what operations those callbacks could invoke.
+
+<p>
+Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
+<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
+(<a href="#Bottom-Half Flavor">discussed below</a>),
+and
+<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
+will all operate normally
+during very early boot, the reason being that there is only one CPU
+and preemption is disabled.
+This means that the call to <tt>synchronize_rcu()</tt> (or friends)
+itself is a quiescent
+state and thus a grace period, so the early-boot implementation can
+be a no-op.
+
+<p>
+Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
+continue to operate normally through the remainder of boot, courtesy
+of the fact that preemption is disabled across their RCU read-side
+critical sections and also courtesy of the fact that there is still
+only one CPU.
+However, once the scheduler starts initializing, preemption is enabled.
+There is still only a single CPU, but the fact that preemption is enabled
+means that the no-op implementation of <tt>synchronize_rcu()</tt> no
+longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
+Therefore, as soon as the scheduler starts initializing, the early-boot
+fastpath is disabled.
+This means that <tt>synchronize_rcu()</tt> switches to its runtime
+mode of operation where it posts callbacks, which in turn means that
+any call to <tt>synchronize_rcu()</tt> will block until the corresponding
+callback is invoked.
+Unfortunately, the callback cannot be invoked until RCU's runtime
+grace-period machinery is up and running, which cannot happen until
+the scheduler has initialized itself sufficiently to allow RCU's
+kthreads to be spawned.
+Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
+initialization can result in deadlock.
+
+<p>@@QQ@@
+So what happens with <tt>synchronize_rcu()</tt> during
+scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
+kernels?
+<p>@@QQA@@
+In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
+maps directly to <tt>synchronize_sched()</tt>.
+Therefore, <tt>synchronize_rcu()</tt> works normally throughout
+boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
+However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
+so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
+during scheduler initialization.
+<p>@@QQE@@
+
+<p>
+I learned of these boot-time requirements as a result of a series of
+system hangs.
+
+<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
+
+<p>
+The Linux kernel has interrupts, and RCU read-side critical sections are
+legal within interrupt handlers and within interrupt-disabled regions
+of code, as are invocations of <tt>call_rcu()</tt>.
+
+<p>
+Some Linux-kernel architectures can enter an interrupt handler from
+non-idle process context, and then just never leave it, instead stealthily
+transitioning back to process context.
+This trick is sometimes used to invoke system calls from inside the kernel.
+These “half-interrupts” mean that RCU has to be very careful
+about how it counts interrupt nesting levels.
+I learned of this requirement the hard way during a rewrite
+of RCU's dyntick-idle code.
+
+<p>
+The Linux kernel has non-maskable interrupts (NMIs), and
+RCU read-side critical sections are legal within NMI handlers.
+Thankfully, RCU update-side primitives, including
+<tt>call_rcu()</tt>, are prohibited within NMI handlers.
+
+<p>
+The name notwithstanding, some Linux-kernel architectures
+can have nested NMIs, which RCU must handle correctly.
+Andy Lutomirski
+<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
+with this requirement;
+he also kindly surprised me with
+<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
+that meets this requirement.
+
+<h3><a name="Loadable Modules">Loadable Modules</a></h3>
+
+<p>
+The Linux kernel has loadable modules, and these modules can
+also be unloaded.
+After a given module has been unloaded, any attempt to call
+one of its functions results in a segmentation fault.
+The module-unload functions must therefore cancel any
+delayed calls to loadable-module functions, for example,
+any outstanding <tt>mod_timer()</tt> must be dealt with
+via <tt>del_timer_sync()</tt> or similar.
+
+<p>
+Unfortunately, there is no way to cancel an RCU callback;
+once you invoke <tt>call_rcu()</tt>, the callback function is
+going to eventually be invoked, unless the system goes down first.
+Because it is normally considered socially irresponsible to crash the system
+in response to a module unload request, we need some other way
+to deal with in-flight RCU callbacks.
+
+<p>
+RCU therefore provides
+<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
+which waits until all in-flight RCU callbacks have been invoked.
+If a module uses <tt>call_rcu()</tt>, its exit function should therefore
+prevent any future invocation of <tt>call_rcu()</tt>, then invoke
+<tt>rcu_barrier()</tt>.
+In theory, the underlying module-unload code could invoke
+<tt>rcu_barrier()</tt> unconditionally, but in practice this would
+incur unacceptable latencies.
+
+<p>
+Nikita Danilov noted this requirement for an analogous filesystem-unmount
+situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
+The need for <tt>rcu_barrier()</tt> for module unloading became
+apparent later.
+
+<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
+
+<p>
+The Linux kernel supports CPU hotplug, which means that CPUs
+can come and go.
+It is of course illegal to use any RCU API member from an offline CPU.
+This requirement was present from day one in DYNIX/ptx, but
+on the other hand, the Linux kernel's CPU-hotplug implementation
+is “interesting.”
+
+<p>
+The Linux-kernel CPU-hotplug implementation has notifiers that
+are used to allow the various kernel subsystems (including RCU)
+to respond appropriately to a given CPU-hotplug operation.
+Most RCU operations may be invoked from CPU-hotplug notifiers,
+including even normal synchronous grace-period operations
+such as <tt>synchronize_rcu()</tt>.
+However, expedited grace-period operations such as
+<tt>synchronize_rcu_expedited()</tt> are not supported,
+due to the fact that current implementations block CPU-hotplug
+operations, which could result in deadlock.
+
+<p>
+In addition, all-callback-wait operations such as
+<tt>rcu_barrier()</tt> are also not supported, due to the
+fact that there are phases of CPU-hotplug operations where
+the outgoing CPU's callbacks will not be invoked until after
+the CPU-hotplug operation ends, which could also result in deadlock.
+
+<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
+
+<p>
+RCU depends on the scheduler, and the scheduler uses RCU to
+protect some of its data structures.
+This means the scheduler is forbidden from acquiring
+the runqueue locks and the priority-inheritance locks
+in the middle of an outermost RCU read-side critical section unless either
+(1) it releases them before exiting that same
+RCU read-side critical section, or
+(2) interrupts are disabled across
+that entire RCU read-side critical section.
+This same prohibition also applies (recursively!) to any lock that is acquired
+while holding any lock to which this prohibition applies.
+Adhering to this rule prevents preemptible RCU from invoking
+<tt>rcu_read_unlock_special()</tt> while either runqueue or
+priority-inheritance locks are held, thus avoiding deadlock.
+
+<p>
+Prior to v4.4, it was only necessary to disable preemption across
+RCU read-side critical sections that acquired scheduler locks.
+In v4.4, expedited grace periods started using IPIs, and these
+IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
+Therefore, this expedited-grace-period change required disabling of
+interrupts, not just preemption.
+
+<p>
+For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
+implementation must be written carefully to avoid similar deadlocks.
+In particular, <tt>rcu_read_unlock()</tt> must tolerate an
+interrupt where the interrupt handler invokes both
+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
+This possibility requires <tt>rcu_read_unlock()</tt> to use
+negative nesting levels to avoid destructive recursion via
+the interrupt handler's use of RCU.
+
+<p>
+This pair of mutual scheduler-RCU requirements came as a
+<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
+
+<p>
+As noted above, RCU makes use of kthreads, and it is necessary to
+avoid excessive CPU-time accumulation by these kthreads.
+This requirement was no surprise, but RCU's violation of it
+when running context-switch-heavy workloads when built with
+<tt>CONFIG_NO_HZ_FULL=y</tt>
+<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
+RCU has made good progress towards meeting this requirement, even
+for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
+but there is room for further improvement.
+
+<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
+
+<p>
+It is possible to use tracing on RCU code, but tracing itself
+uses RCU.
+For this reason, <tt>rcu_dereference_raw_notrace()</tt>
+is provided for use by tracing, which avoids the destructive
+recursion that could otherwise ensue.
+This API is also used by virtualization in some architectures,
+where RCU readers execute in environments in which tracing
+cannot be used.
+The tracing folks both located the requirement and provided the
+needed fix, so this surprise requirement was relatively painless.
+
+<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
+
+<p>
+Interrupting idle CPUs is considered socially unacceptable,
+especially by people with battery-powered embedded systems.
+RCU therefore conserves energy by detecting which CPUs are
+idle, including tracking CPUs that have been interrupted from idle.
+This is a large part of the energy-efficiency requirement,
+so I learned of this via an irate phone call.
+
+<p>
+Because RCU avoids interrupting idle CPUs, it is illegal to
+execute an RCU read-side critical section on an idle CPU.
+(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
+if you try it.)
+The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
+event tracing are provided to work around this restriction.
+In addition, <tt>rcu_is_watching()</tt> may be used to
+test whether or not it is currently legal to run RCU read-side
+critical sections on this CPU.
+I learned of the need for diagnostics on the one hand
+and <tt>RCU_NONIDLE()</tt> on the other while inspecting
+idle-loop code.
+Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
+which is used quite heavily in the idle loop.
+
+<p>
+It is similarly socially unacceptable to interrupt an
+<tt>nohz_full</tt> CPU running in userspace.
+RCU must therefore track <tt>nohz_full</tt> userspace
+execution.
+And in
+<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
+kernels, RCU must separately track idle CPUs on the one hand and
+CPUs that are either idle or executing in userspace on the other.
+In both cases, RCU must be able to sample state at two points in
+time, and be able to determine whether or not some other CPU spent
+any time idle and/or executing in userspace.
+
+<p>
+These energy-efficiency requirements have proven quite difficult to
+understand and to meet, for example, there have been more than five
+clean-sheet rewrites of RCU's energy-efficiency code, the last of
+which was finally able to demonstrate
+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
+As noted earlier,
+I learned of many of these requirements via angry phone calls:
+Flaming me on the Linux-kernel mailing list was apparently not
+sufficient to fully vent their ire at RCU's energy-efficiency bugs!
+
+<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
+
+<p>
+Although small-memory non-realtime systems can simply use Tiny RCU,
+code size is only one aspect of memory efficiency.
+Another aspect is the size of the <tt>rcu_head</tt> structure
+used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
+Although this structure contains nothing more than a pair of pointers,
+it does appear in many RCU-protected data structures, including
+some that are size critical.
+The <tt>page</tt> structure is a case in point, as evidenced by
+the many occurrences of the <tt>union</tt> keyword within that structure.
+
+<p>
+This need for memory efficiency is one reason that RCU uses hand-crafted
+singly linked lists to track the <tt>rcu_head</tt> structures that
+are waiting for a grace period to elapse.
+It is also the reason why <tt>rcu_head</tt> structures do not contain
+debug information, such as fields tracking the file and line of the
+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
+Although this information might appear in debug-only kernel builds at some
+point, in the meantime, the <tt>->func</tt> field will often provide
+the needed debug information.
+
+<p>
+However, in some cases, the need for memory efficiency leads to even
+more extreme measures.
+Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
+shares storage with a great many other structures that are used at
+various points in the corresponding page's lifetime.
+In order to correctly resolve certain
+<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
+the Linux kernel's memory-management subsystem needs a particular bit
+to remain zero during all phases of grace-period processing,
+and that bit happens to map to the bottom bit of the
+<tt>rcu_head</tt> structure's <tt>->next</tt> field.
+RCU makes this guarantee as long as <tt>call_rcu()</tt>
+is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
+or some future “lazy”
+variant of <tt>call_rcu()</tt> that might one day be created for
+energy-efficiency purposes.
+
+<h3><a name="Performance, Scalability, Response Time, and Reliability">
+Performance, Scalability, Response Time, and Reliability</a></h3>
+
+<p>
+Expanding on the
+<a href="#Performance and Scalability">earlier discussion</a>,
+RCU is used heavily by hot code paths in performance-critical
+portions of the Linux kernel's networking, security, virtualization,
+and scheduling code paths.
+RCU must therefore use efficient implementations, especially in its
+read-side primitives.
+To that end, it would be good if preemptible RCU's implementation
+of <tt>rcu_read_lock()</tt> could be inlined, however, doing
+this requires resolving <tt>#include</tt> issues with the
+<tt>task_struct</tt> structure.
+
+<p>
+The Linux kernel supports hardware configurations with up to
+4096 CPUs, which means that RCU must be extremely scalable.
+Algorithms that involve frequent acquisitions of global locks or
+frequent atomic operations on global variables simply cannot be
+tolerated within the RCU implementation.
+RCU therefore makes heavy use of a combining tree based on the
+<tt>rcu_node</tt> structure.
+RCU is required to tolerate all CPUs continuously invoking any
+combination of RCU's runtime primitives with minimal per-operation
+overhead.
+In fact, in many cases, increasing load must <i>decrease</i> the
+per-operation overhead, witness the batching optimizations for
+<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
+<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
+As a general rule, RCU must cheerfully accept whatever the
+rest of the Linux kernel decides to throw at it.
+
+<p>
+The Linux kernel is used for real-time workloads, especially
+in conjunction with the
+<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
+The real-time-latency response requirements are such that the
+traditional approach of disabling preemption across RCU
+read-side critical sections is inappropriate.
+Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
+use an RCU implementation that allows RCU read-side critical
+sections to be preempted.
+This requirement made its presence known after users made it
+clear that an earlier
+<a href="https://lwn.net/Articles/107930/">real-time patch</a>
+did not meet their needs, in conjunction with some
+<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
+encountered by a very early version of the -rt patchset.
+
+<p>
+In addition, RCU must make do with a sub-100-microsecond real-time latency
+budget.
+In fact, on smaller systems with the -rt patchset, the Linux kernel
+provides sub-20-microsecond real-time latencies for the whole kernel,
+including RCU.
+RCU's scalability and latency must therefore be sufficient for
+these sorts of configurations.
+To my surprise, the sub-100-microsecond real-time latency budget
+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
+applies to even the largest systems [PDF]</a>,
+up to and including systems with 4096 CPUs.
+This real-time requirement motivated the grace-period kthread, which
+also simplified handling of a number of race conditions.
+
+<p>
+Finally, RCU's status as a synchronization primitive means that
+any RCU failure can result in arbitrary memory corruption that can be
+extremely difficult to debug.
+This means that RCU must be extremely reliable, which in
+practice also means that RCU must have an aggressive stress-test
+suite.
+This stress-test suite is called <tt>rcutorture</tt>.
+
+<p>
+Although the need for <tt>rcutorture</tt> was no surprise,
+the current immense popularity of the Linux kernel is posing
+interesting—and perhaps unprecedented—validation
+challenges.
+To see this, keep in mind that there are well over one billion
+instances of the Linux kernel running today, given Android
+smartphones, Linux-powered televisions, and servers.
+This number can be expected to increase sharply with the advent of
+the celebrated Internet of Things.
+
+<p>
+Suppose that RCU contains a race condition that manifests on average
+once per million years of runtime.
+This bug will be occurring about three times per <i>day</i> across
+the installed base.
+RCU could simply hide behind hardware error rates, given that no one
+should really expect their smartphone to last for a million years.
+However, anyone taking too much comfort from this thought should
+consider the fact that in most jurisdictions, a successful multi-year
+test of a given mechanism, which might include a Linux kernel,
+suffices for a number of types of safety-critical certifications.
+In fact, rumor has it that the Linux kernel is already being used
+in production for safety-critical applications.
+I don't know about you, but I would feel quite bad if a bug in RCU
+killed someone.
+Which might explain my recent focus on validation and verification.
+
+<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
+
+<p>
+One of the more surprising things about RCU is that there are now
+no fewer than five <i>flavors</i>, or API families.
+In addition, the primary flavor that has been the sole focus up to
+this point has two different implementations, non-preemptible and
+preemptible.
+The other four flavors are listed below, with requirements for each
+described in a separate section.
+
+<ol>
+<li> <a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
+<li> <a href="#Sched Flavor">Sched Flavor</a>
+<li> <a href="#Sleepable RCU">Sleepable RCU</a>
+<li> <a href="#Tasks RCU">Tasks RCU</a>
+</ol>
+
+<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
+
+<p>
+The softirq-disable (AKA “bottom-half”,
+hence the “_bh” abbreviations)
+flavor of RCU, or <i>RCU-bh</i>, was developed by
+Dipankar Sarma to provide a flavor of RCU that could withstand the
+network-based denial-of-service attacks researched by Robert
+Olsson.
+These attacks placed so much networking load on the system
+that some of the CPUs never exited softirq execution,
+which in turn prevented those CPUs from ever executing a context switch,
+which, in the RCU implementation of that time, prevented grace periods
+from ever ending.
+The result was an out-of-memory condition and a system hang.
+
+<p>
+The solution was the creation of RCU-bh, which does
+<tt>local_bh_disable()</tt>
+across its read-side critical sections, and which uses the transition
+from one type of softirq processing to another as a quiescent state
+in addition to context switch, idle, user mode, and offline.
+This means that RCU-bh grace periods can complete even when some of
+the CPUs execute in softirq indefinitely, thus allowing algorithms
+based on RCU-bh to withstand network-based denial-of-service attacks.
+
+<p>
+Because
+<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
+disable and re-enable softirq handlers, any attempt to start a softirq
+handler during the
+RCU-bh read-side critical section will be deferred.
+In this case, <tt>rcu_read_unlock_bh()</tt>
+will invoke softirq processing, which can take considerable time.
+One can of course argue that this softirq overhead should be associated
+with the code following the RCU-bh read-side critical section rather
+than <tt>rcu_read_unlock_bh()</tt>, but the fact
+is that most profiling tools cannot be expected to make this sort
+of fine distinction.
+For example, suppose that a three-millisecond-long RCU-bh read-side
+critical section executes during a time of heavy networking load.
+There will very likely be an attempt to invoke at least one softirq
+handler during that three milliseconds, but any such invocation will
+be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
+This can of course make it appear at first glance as if
+<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
+
+<p>
+The
+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
+includes
+<tt>rcu_read_lock_bh()</tt>,
+<tt>rcu_read_unlock_bh()</tt>,
+<tt>rcu_dereference_bh()</tt>,
+<tt>rcu_dereference_bh_check()</tt>,
+<tt>synchronize_rcu_bh()</tt>,
+<tt>synchronize_rcu_bh_expedited()</tt>,
+<tt>call_rcu_bh()</tt>,
+<tt>rcu_barrier_bh()</tt>, and
+<tt>rcu_read_lock_bh_held()</tt>.
+
+<h3><a name="Sched Flavor">Sched Flavor</a></h3>
+
+<p>
+Before preemptible RCU, waiting for an RCU grace period had the
+side effect of also waiting for all pre-existing interrupt
+and NMI handlers.
+However, there are legitimate preemptible-RCU implementations that
+do not have this property, given that any point in the code outside
+of an RCU read-side critical section can be a quiescent state.
+Therefore, <i>RCU-sched</i> was created, which follows “classic”
+RCU in that an RCU-sched grace period waits for pre-existing
+interrupt and NMI handlers.
+In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
+APIs have identical implementations, while kernels built with
+<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
+
+<p>
+Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
+<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
+disable and re-enable preemption, respectively.
+This means that if there was a preemption attempt during the
+RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
+will enter the scheduler, with all the latency and overhead entailed.
+Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
+as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
+However, the highest-priority task won't be preempted, so that task
+will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
+
+<p>
+The
+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
+includes
+<tt>rcu_read_lock_sched()</tt>,
+<tt>rcu_read_unlock_sched()</tt>,
+<tt>rcu_read_lock_sched_notrace()</tt>,
+<tt>rcu_read_unlock_sched_notrace()</tt>,
+<tt>rcu_dereference_sched()</tt>,
+<tt>rcu_dereference_sched_check()</tt>,
+<tt>synchronize_sched()</tt>,
+<tt>synchronize_rcu_sched_expedited()</tt>,
+<tt>call_rcu_sched()</tt>,
+<tt>rcu_barrier_sched()</tt>, and
+<tt>rcu_read_lock_sched_held()</tt>.
+However, anything that disables preemption also marks an RCU-sched
+read-side critical section, including
+<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
+<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
+and so on.
+
+<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
+
+<p>
+For well over a decade, someone saying “I need to block within
+an RCU read-side critical section” was a reliable indication
+that this someone did not understand RCU.
+After all, if you are always blocking in an RCU read-side critical
+section, you can probably afford to use a higher-overhead synchronization
+mechanism.
+However, that changed with the advent of the Linux kernel's notifiers,
+whose RCU read-side critical
+sections almost never sleep, but sometimes need to.
+This resulted in the introduction of
+<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
+or <i>SRCU</i>.
+
+<p>
+SRCU allows different domains to be defined, with each such domain
+defined by an instance of an <tt>srcu_struct</tt> structure.
+A pointer to this structure must be passed in to each SRCU function,
+for example, <tt>synchronize_srcu(&ss)</tt>, where
+<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
+The key benefit of these domains is that a slow SRCU reader in one
+domain does not delay an SRCU grace period in some other domain.
+That said, one consequence of these domains is that read-side code
+must pass a “cookie” from <tt>srcu_read_lock()</tt>
+to <tt>srcu_read_unlock()</tt>, for example, as follows:
+
+<blockquote>
+<pre>
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 srcu_read_unlock(&ss, idx);
+</pre>
+</blockquote>
+
+<p>
+As noted above, it is legal to block within SRCU read-side critical sections.
+However, with great power comes great responsibility.
+If you block forever in one of a given domain's SRCU read-side critical
+sections, then that domain's grace periods will also be blocked forever.
+Of course, one good way to block forever is to deadlock, which can
+happen if any operation in a given domain's SRCU read-side critical
+section can block waiting, either directly or indirectly, for that domain's
+grace period to elapse.
+For example, this results in a self-deadlock:
+
+<blockquote>
+<pre>
+ 1 int idx;
+ 2
+ 3 idx = srcu_read_lock(&ss);
+ 4 do_something();
+ 5 synchronize_srcu(&ss);
+ 6 srcu_read_unlock(&ss, idx);
+</pre>
+</blockquote>
+
+<p>
+However, if line 5 instead acquired a mutex that was held across
+a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
+deadlock would still be possible.
+Furthermore, if line 5 acquired a mutex that was held across
+a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
+and if an <tt>ss1</tt>-domain SRCU read-side critical section
+acquired another mutex that was held across an <tt>ss</tt>-domain
+<tt>synchronize_srcu()</tt>,
+deadlock would again be possible.
+Such a deadlock cycle could extend across an arbitrarily large number
+of different SRCU domains.
+Again, with great power comes great responsibility.
+
+<p>
+Unlike the other RCU flavors, SRCU read-side critical sections can
+run on idle and even offline CPUs.
+This ability requires that <tt>srcu_read_lock()</tt> and
+<tt>srcu_read_unlock()</tt> contain memory barriers, which means
+that SRCU readers will run a bit slower than would RCU readers.
+It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
+API, which, in combination with <tt>srcu_read_unlock()</tt>,
+guarantees a full memory barrier.
+
+<p>
+The
+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
+includes
+<tt>srcu_read_lock()</tt>,
+<tt>srcu_read_unlock()</tt>,
+<tt>srcu_dereference()</tt>,
+<tt>srcu_dereference_check()</tt>,
+<tt>synchronize_srcu()</tt>,
+<tt>synchronize_srcu_expedited()</tt>,
+<tt>call_srcu()</tt>,
+<tt>srcu_barrier()</tt>, and
+<tt>srcu_read_lock_held()</tt>.
+It also includes
+<tt>DEFINE_SRCU()</tt>,
+<tt>DEFINE_STATIC_SRCU()</tt>, and
+<tt>init_srcu_struct()</tt>
+APIs for defining and initializing <tt>srcu_struct</tt> structures.
+
+<h3><a name="Tasks RCU">Tasks RCU</a></h3>
+
+<p>
+Some forms of tracing use “trampolines” to handle the
+binary rewriting required to install different types of probes.
+It would be good to be able to free old trampolines, which sounds
+like a job for some form of RCU.
+However, because it is necessary to be able to install a trace
+anywhere in the code, it is not possible to use read-side markers
+such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
+In addition, it does not work to have these markers in the trampoline
+itself, because there would need to be instructions following
+<tt>rcu_read_unlock()</tt>.
+Although <tt>synchronize_rcu()</tt> would guarantee that execution
+reached the <tt>rcu_read_unlock()</tt>, it would not be able to
+guarantee that execution had completely left the trampoline.
+
+<p>
+The solution, in the form of
+<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
+is to have implicit
+read-side critical sections that are delimited by voluntary context
+switches, that is, calls to <tt>schedule()</tt>,
+<tt>cond_resched_rcu_qs()</tt>, and
+<tt>synchronize_rcu_tasks()</tt>.
+In addition, transitions to and from userspace execution also delimit
+tasks-RCU read-side critical sections.
+
+<p>
+The tasks-RCU API is quite compact, consisting only of
+<tt>call_rcu_tasks()</tt>,
+<tt>synchronize_rcu_tasks()</tt>, and
+<tt>rcu_barrier_tasks()</tt>.
+
+<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
+
+<p>
+One of the tricks that RCU uses to attain update-side scalability is
+to increase grace-period latency with increasing numbers of CPUs.
+If this becomes a serious problem, it will be necessary to rework the
+grace-period state machine so as to avoid the need for the additional
+latency.
+
+<p>
+Expedited grace periods scan the CPUs, so their latency and overhead
+increase with increasing numbers of CPUs.
+If this becomes a serious problem on large systems, it will be necessary
+to do some redesign to avoid this scalability problem.
+
+<p>
+RCU disables CPU hotplug in a few places, perhaps most notably in the
+expedited grace-period and <tt>rcu_barrier()</tt> operations.
+If there is a strong reason to use expedited grace periods in CPU-hotplug
+notifiers, it will be necessary to avoid disabling CPU hotplug.
+This would introduce some complexity, so there had better be a <i>very</i>
+good reason.
+
+<p>
+The tradeoff between grace-period latency on the one hand and interruptions
+of other CPUs on the other hand may need to be re-examined.
+The desire is of course for zero grace-period latency as well as zero
+interprocessor interrupts undertaken during an expedited grace period
+operation.
+While this ideal is unlikely to be achievable, it is quite possible that
+further improvements can be made.
+
+<p>
+The multiprocessor implementations of RCU use a combining tree that
+groups CPUs so as to reduce lock contention and increase cache locality.
+However, this combining tree does not spread its memory across NUMA
+nodes nor does it align the CPU groups with hardware features such
+as sockets or cores.
+Such spreading and alignment is currently believed to be unnecessary
+because the hotpath read-side primitives do not access the combining
+tree, nor does <tt>call_rcu()</tt> in the common case.
+If you believe that your architecture needs such spreading and alignment,
+then your architecture should also benefit from the
+<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
+to the number of CPUs in a socket, NUMA node, or whatever.
+If the number of CPUs is too large, use a fraction of the number of
+CPUs.
+If the number of CPUs is a large prime number, well, that certainly
+is an “interesting” architectural choice!
+More flexible arrangements might be considered, but only if
+<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
+if the inadequacy has been demonstrated by a carefully run and
+realistic system-level workload.
+
+<p>
+Please note that arrangements that require RCU to remap CPU numbers will
+require extremely good demonstration of need and full exploration of
+alternatives.
+
+<p>
+There is an embarrassingly large number of flavors of RCU, and this
+number has been increasing over time.
+Perhaps it will be possible to combine some at some future date.
+
+<p>
+RCU's various kthreads are reasonably recent additions.
+It is quite likely that adjustments will be required to more gracefully
+handle extreme loads.
+It might also be necessary to be able to relate CPU utilization by
+RCU's kthreads and softirq handlers to the code that instigated this
+CPU utilization.
+For example, RCU callback overhead might be charged back to the
+originating <tt>call_rcu()</tt> instance, though probably not
+in production kernels.
+
+<h2><a name="Summary">Summary</a></h2>
+
+<p>
+This document has presented more than two decades' worth of RCU
+requirements.
+Given that the requirements keep changing, this will not be the last
+word on this subject, but at least it serves to get an important
+subset of the requirements set forth.
+
+<h2><a name="Acknowledgments">Acknowledgments</a></h2>
+
+I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
+Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
+Andy Lutomirski for their help in rendering
+this article human readable, and to Michelle Rankin for her support
+of this effort.
+Other contributions are acknowledged in the Linux kernel's git archive.
+The cartoon is copyright (c) 2013 by Melissa Broussard,
+and is provided
+under the terms of the Creative Commons Attribution-Share Alike 3.0
+United States license.
+
+<p>@@QQAL@@
+
+</body></html>
--- /dev/null
+#!/bin/sh
+#
+# Usage: sh htmlqqz.sh file
+#
+# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
+# Commands, all of which must be on a line by themselves:
+#
+# "<p>@@QQ@@": Start of a quick quiz.
+# "<p>@@QQA@@": Start of a quick-quiz answer.
+# "<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
+# "<p>@@QQAL@@": Place to put quick-quiz answer list.
+#
+# Places the result in file.html.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, you can access it online at
+# http://www.gnu.org/licenses/gpl-2.0.html.
+#
+# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.
+
+# Base name of the document, without the .htmlx/.html suffix.
+fn=$1
+
+# Refuse to run without an argument rather than operating on ".htmlx".
+if test -z "$fn"
+then
+	echo "Usage: sh htmlqqz.sh file" >&2
+	exit 1
+fi
+
+# Every expansion of $fn is quoted so that a name containing whitespace
+# or glob characters is still treated as a single word.
+if test ! -r "$fn.htmlx"
+then
+	echo "Error: $fn.htmlx unreadable." >&2
+	exit 1
+fi
+
+echo "<!-- DO NOT HAND EDIT. -->" > "$fn.html"
+echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> "$fn.html"
+awk < "$fn.htmlx" >> "$fn.html" '
+
+# This awk program is a small state machine driven by the variable "state":
+#   ""    - ordinary text, copied through to the output
+#   "qq"  - inside a quick-quiz question
+#   "qqa" - inside a quick-quiz answer
+# The program is enclosed in shell single quotes, so no apostrophes may
+# appear anywhere in it, including in comments.
+
+# Ordinary text: copy it through, warning about any unexpected command.
+state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
+	print $0;
+	if ($0 ~ /^<p>@@QQ/)
+		print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
+	next;
+}
+
+# Start of a quick quiz: number it and emit the anchor that the answer
+# list links back to.
+state == "" && $1 == "<p>@@QQ@@" {
+	qqn++;
+	qqlineno = NR;
+	haveqq = 1;
+	state = "qq";
+	print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
+	next;
+}
+
+# Question text: print it in place and also save it in qq[] for the
+# answer list.
+state == "qq" && $1 != "<p>@@QQA@@" {
+	qq[qqn] = qq[qqn] $0 "\n";
+	print $0
+	if ($0 ~ /^<p>@@QQ/)
+		print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
+	next;
+}
+
+# Start of the answer: emit a link to the answer instead of the answer.
+state == "qq" && $1 == "<p>@@QQA@@" {
+	state = "qqa";
+	print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
+	next;
+}
+
+# Answer text: save it in qqa[] without printing it here.
+state == "qqa" && $1 != "<p>@@QQE@@" {
+	qqa[qqn] = qqa[qqn] $0 "\n";
+	if ($0 ~ /^<p>@@QQ/)
+		print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
+	next;
+}
+
+# End of the answer, and thus of the quick quiz.
+state == "qqa" && $1 == "<p>@@QQE@@" {
+	state = "";
+	next;
+}
+
+# Answer list: emit every saved question followed by its answer and a
+# link back to the quiz.  Clearing haveqq records that the quizzes seen
+# so far have been listed.
+state == "" && $1 == "<p>@@QQAL@@" {
+	haveqq = "";
+	print "<h3><a name=\"Answers to Quick Quizzes\">"
+	print "Answers to Quick Quizzes</a></h3>"
+	print "";
+	for (i = 1; i <= qqn; i++) {
+		print "<a name=\"qq" i "answer\"></a>"
+		print "<p><b>Quick Quiz " i "</b>:"
+		print qq[i];
+		print "";
+		print "</p><p><b>Answer</b>:"
+		print qqa[i];
+		print "";
+		print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>"
+		print "";
+	}
+	next;
+}
+
+# End of input: complain about a quiz that was never closed, or about
+# quizzes that were never followed by an answer list.
+END {
+	if (state != "")
+		print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
+	else if (haveqq)
+		print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
+}'
Optional properties:
- ti,hwmods: Name of the hwmods associated to the eDMA CC
- ti,edma-memcpy-channels: List of channels allocated to be used for memcpy, iow
- these channels will be SW triggered channels. The list must
- contain 16 bits numbers, see example.
+ these channels will be SW triggered channels. See example.
- ti,edma-reserved-slot-ranges: PaRAM slot ranges which should not be used by
the driver, they are allocated to be used by for example the
DSP. See example.
ti,tptcs = <&edma_tptc0 7>, <&edma_tptc1 7>, <&edma_tptc2 0>;
/* Channel 20 and 21 is allocated for memcpy */
- ti,edma-memcpy-channels = /bits/ 16 <20 21>;
- /* The following PaRAM slots are reserved: 35-45 and 100-110 */
- ti,edma-reserved-slot-ranges = /bits/ 16 <35 10>,
- /bits/ 16 <100 10>;
+ ti,edma-memcpy-channels = <20 21>;
+ /* The following PaRAM slots are reserved: 35-44 and 100-109 */
+ ti,edma-reserved-slot-ranges = <35 10>, <100 10>;
};
edma_tptc0: tptc@49800000 {
Required subnode-properties:
- label: Descriptive name of the key.
- linux,code: Keycode to emit.
- - channel: Channel this key is attached to, mut be 0 or 1.
+ - channel: Channel this key is attached to, must be 0 or 1.
- voltage: Voltage in µV at lradc input when this key is pressed.
Example:
as RedBoot.
The partition table should be a subnode of the mtd node and should be named
-'partitions'. Partitions are defined in subnodes of the partitions node.
+'partitions'. This node should have the following property:
+- compatible : (required) must be "fixed-partitions"
+Partitions are then defined in subnodes of the partitions node.
For backwards compatibility partitions as direct subnodes of the mtd device are
supported. This use is discouraged.
flash@0 {
partitions {
+ compatible = "fixed-partitions";
#address-cells = <1>;
#size-cells = <1>;
flash@1 {
partitions {
+ compatible = "fixed-partitions";
#address-cells = <1>;
#size-cells = <2>;
flash@2 {
partitions {
+ compatible = "fixed-partitions";
#address-cells = <2>;
#size-cells = <2>;
Slave Properties:
Required properties:
-- phy_id : Specifies slave phy id
- phy-mode : See ethernet.txt file in the same directory
Optional properties:
- dual_emac_res_vlan : Specifies VID to be used to segregate the ports
- mac-address : See ethernet.txt file in the same directory
+- phy_id : Specifies slave phy id
- phy-handle : See ethernet.txt file in the same directory
Slave sub-nodes:
- fixed-link : See fixed-link.txt file in the same directory
- Either the properties phy_id and phy-mode,
- or the sub-node fixed-link can be specified
+ Either the property phy_id, or the sub-node
+ fixed-link can be specified
Note: "ti,hwmods" field is used to fetch the base address and irq
resources from TI, omap hwmod data base during device registration.
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
- const char *(*follow_link) (struct dentry *, void **);
- void (*put_link) (struct inode *, void *);
+ const char *(*get_link) (struct dentry *, struct inode *, void **);
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, unsigned int);
int (*get_acl)(struct inode *, int);
rename: yes (all) (see below)
rename2: yes (all) (see below)
readlink: no
-follow_link: no
-put_link: no
+get_link: no
setattr: yes
permission: no (may not block if called in rcu-walk mode)
get_acl: no
[mandatory]
__fd_install() & fd_install() can now sleep. Callers should not
hold a spinlock or other resources that do not allow a schedule.
+--
+[mandatory]
+ any symlink that might use page_follow_link_light/page_put_link() must
+ have inode_nohighmem(inode) called before anything might start playing with
+ its pagecache.
+--
+[mandatory]
+ ->follow_link() is replaced with ->get_link(); same API, except that
+ * ->get_link() gets inode as a separate argument
+ * ->get_link() may be called in RCU mode - in that case NULL
+ dentry is passed
+--
+[mandatory]
+ ->get_link() gets struct delayed_call *done now, and should do
+ set_delayed_call() where it used to set *cookie.
+ ->put_link() is gone - just give the destructor to set_delayed_call()
+ in ->get_link().
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
- const char *(*follow_link) (struct dentry *, void **);
- void (*put_link) (struct inode *, void *);
+ const char *(*get_link) (struct dentry *, struct inode *,
+ struct delayed_call *);
int (*permission) (struct inode *, int);
int (*get_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
readlink: called by the readlink(2) system call. Only required if
you want to support reading symbolic links
- follow_link: called by the VFS to follow a symbolic link to the
+ get_link: called by the VFS to follow a symbolic link to the
inode it points to. Only required if you want to support
symbolic links. This method returns the symlink body
to traverse (and possibly resets the current position with
nd_jump_link()). If the body won't go away until the inode
is gone, nothing else is needed; if it needs to be otherwise
- pinned, the data needed to release whatever we'd grabbed
- is to be stored in void * variable passed by address to
- follow_link() instance.
-
- put_link: called by the VFS to release resources allocated by
- follow_link(). The cookie stored by follow_link() is passed
- to this method as the last parameter; only called when
- cookie isn't NULL.
+ pinned, arrange for its release by having get_link(..., ..., done)
+ do set_delayed_call(done, destructor, argument).
+ In that case destructor(argument) will be called once VFS is
+ done with the body you've returned.
+ May be called in RCU mode; that is indicated by NULL dentry
+ argument. If request can't be handled without leaving RCU mode,
+ have it return ERR_PTR(-ECHILD).
permission: called by the VFS to check for access rights on a POSIX-like
filesystem.
rcutorture.verbose= [KNL]
Enable additional printk() statements.
+ rcupdate.rcu_cpu_stall_suppress= [KNL]
+ Suppress RCU CPU stall warning messages.
+
+ rcupdate.rcu_cpu_stall_timeout= [KNL]
+ Set timeout for RCU CPU stall warning messages.
+
rcupdate.rcu_expedited= [KNL]
Use expedited grace-period primitives, for
example, synchronize_rcu_expedited() instead
of synchronize_rcu(). This reduces latency,
but can increase CPU utilization, degrade
real-time latency, and degrade energy efficiency.
-
- rcupdate.rcu_cpu_stall_suppress= [KNL]
- Suppress RCU CPU stall warning messages.
-
- rcupdate.rcu_cpu_stall_timeout= [KNL]
- Set timeout for RCU CPU stall warning messages.
+ No effect on CONFIG_TINY_RCU kernels.
+
+ rcupdate.rcu_normal= [KNL]
+ Use only normal grace-period primitives,
+ for example, synchronize_rcu() instead of
+ synchronize_rcu_expedited(). This improves
+ real-time latency, CPU utilization, and
+ energy efficiency, but can expose users to
+ increased grace-period latency. This parameter
+ overrides rcupdate.rcu_expedited. No effect on
+ CONFIG_TINY_RCU kernels.
+
+ rcupdate.rcu_normal_after_boot= [KNL]
+ Once boot has completed (that is, after
+ rcu_end_inkernel_boot() has been invoked), use
+ only normal grace-period primitives. No effect
+ on CONFIG_TINY_RCU kernels.
rcupdate.rcu_task_stall_timeout= [KNL]
Set timeout in jiffies for RCU task stall warning
(*) On any given CPU, dependent memory accesses will be issued in order, with
respect to itself. This means that for:
- WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
+ Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
the CPU will issue the following memory operations:
and always in that order. On most systems, smp_read_barrier_depends()
does nothing, but it is required for DEC Alpha. The READ_ONCE()
- and WRITE_ONCE() are required to prevent compiler mischief. Please
- note that you should normally use something like rcu_dereference()
- instead of open-coding smp_read_barrier_depends().
+ is required to prevent compiler mischief. Please note that you
+ should normally use something like rcu_dereference() instead of
+ open-coding smp_read_barrier_depends().
(*) Overlapping loads and stores within a particular CPU will appear to be
ordered within that CPU. This means that for:
(*) smp_store_mb(var, value)
This assigns the value to the variable and then inserts a full memory
- barrier after it, depending on the function. It isn't guaranteed to
- insert anything more than a compiler barrier in a UP compilation.
+ barrier after it. It isn't guaranteed to insert anything more than a
+ compiler barrier in a UP compilation.
(*) smp_mb__before_atomic();
If an issue is identified with the released source code on the supported
kernel with a supported adapter, email the specific information related to the
issue to e1000-devel@lists.sourceforge.net.
-
-
-License
-=======
-
-This software program is released under the terms of a license agreement
-between you ('Licensee') and Intel. Do not use or load this software or any
-associated materials (collectively, the 'Software') until you have carefully
-read the full terms and conditions of the file COPYING located in this software
-package. By loading or using the Software, you agree to the terms of this
-Agreement. If you do not agree with the terms of this Agreement, do not install
-or use the Software.
-
-* Other names and brands may be claimed as the property of others.
R: Shannon Nelson <shannon.nelson@intel.com>
R: Carolyn Wyborny <carolyn.wyborny@intel.com>
R: Don Skidmore <donald.c.skidmore@intel.com>
-R: Matthew Vick <matthew.vick@intel.com>
+R: Bruce Allan <bruce.w.allan@intel.com>
R: John Ronciak <john.ronciak@intel.com>
R: Mitch Williams <mitch.a.williams@intel.com>
L: intel-wired-lan@lists.osuosl.org
S: Maintained
F: drivers/pinctrl/samsung/
+PIN CONTROLLER - SINGLE
+M: Tony Lindgren <tony@atomide.com>
+M: Haojian Zhuang <haojian.zhuang@linaro.org>
+L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+L: linux-omap@vger.kernel.org
+S: Maintained
+F: drivers/pinctrl/pinctrl-single.c
+
PIN CONTROLLER - ST SPEAR
M: Viresh Kumar <vireshk@kernel.org>
L: spear-devel@list.st.com
F: Documentation/rpmsg.txt
F: include/linux/rpmsg.h
+RENESAS ETHERNET DRIVERS
+R: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
+L: netdev@vger.kernel.org
+L: linux-sh@vger.kernel.org
+F: drivers/net/ethernet/renesas/
+F: include/linux/sh_eth.h
+
RESET CONTROLLER FRAMEWORK
M: Philipp Zabel <p.zabel@pengutronix.de>
S: Maintained
VERSION = 4
PATCHLEVEL = 4
SUBLEVEL = 0
-EXTRAVERSION = -rc5
+EXTRAVERSION =
NAME = Blurry Fish Butt
# *DOCUMENTATION*
However some customers have peripherals mapped at this addr, so
Linux needs to be scooted a bit.
If you don't know what the above means, leave this setting alone.
+ This needs to match memory start address specified in Device Tree
config HIGHMEM
bool "High Memory Support"
LIBGCC := $(shell $(CC) $(cflags-y) --print-libgcc-file-name)
# Modules with short calls might break for calls into builtin-kernel
-KBUILD_CFLAGS_MODULE += -mlong-calls
+KBUILD_CFLAGS_MODULE += -mlong-calls -mno-millicode
# Finally dump eveything into kernel build system
KBUILD_CFLAGS += $(cflags-y)
snps,pbl = < 32 >;
clocks = <&apbclk>;
clock-names = "stmmaceth";
+ max-speed = <100>;
};
ehci@0x40000 {
memory {
device_type = "memory";
- reg = <0x0 0x80000000 0x0 0x40000000 /* 1 GB low mem */
+ /* CONFIG_LINUX_LINK_BASE needs to match low mem start */
+ reg = <0x0 0x80000000 0x0 0x20000000 /* 512 MB low mem */
0x1 0x00000000 0x0 0x40000000>; /* 1 GB highmem */
};
#define ARC_REG_IC_IVIC 0x10
#define ARC_REG_IC_CTRL 0x11
#define ARC_REG_IC_IVIL 0x19
-#if defined(CONFIG_ARC_MMU_V3) || defined(CONFIG_ARC_MMU_V4)
#define ARC_REG_IC_PTAG 0x1E
-#endif
#define ARC_REG_IC_PTAG_HI 0x1F
/* Bit val in IC_CTRL */
* @dt_compat: Array of device tree 'compatible' strings
* (XXX: although only 1st entry is looked at)
* @init_early: Very early callback [called from setup_arch()]
- * @init_cpu_smp: for each CPU as it is coming up (SMP as well as UP)
+ * @init_per_cpu: for each CPU as it is coming up (SMP as well as UP)
* [(M):init_IRQ(), (o):start_kernel_secondary()]
* @init_machine: arch initcall level callback (e.g. populate static
* platform devices or parse Devicetree)
const char **dt_compat;
void (*init_early)(void);
#ifdef CONFIG_SMP
- void (*init_cpu_smp)(unsigned int);
+ void (*init_per_cpu)(unsigned int);
#endif
void (*init_machine)(void);
void (*init_late)(void);
* @init_early_smp: A SMP specific h/w block can init itself
* Could be common across platforms so not covered by
* mach_desc->init_early()
- * @init_irq_cpu: Called for each core so SMP h/w block driver can do
+ * @init_per_cpu: Called for each core so SMP h/w block driver can do
* any needed setup per cpu (e.g. IPI request)
* @cpu_kick: For Master to kickstart a cpu (optionally at a PC)
* @ipi_send: To send IPI to a @cpu
struct plat_smp_ops {
const char *info;
void (*init_early_smp)(void);
- void (*init_irq_cpu)(int cpu);
+ void (*init_per_cpu)(int cpu);
void (*cpu_kick)(int cpu, unsigned long pc);
void (*ipi_send)(int cpu);
void (*ipi_clear)(int irq);
extern int arc_unwind(struct unwind_frame_info *frame);
extern void arc_unwind_init(void);
-extern void arc_unwind_setup(void);
extern void *unwind_add_table(struct module *module, const void *table_start,
unsigned long table_size);
extern void unwind_remove_table(void *handle, int init_only);
{
}
-static inline void arc_unwind_setup(void)
-{
-}
#define unwind_add_table(a, b, c)
#define unwind_remove_table(a, b)
static int arcv2_irq_map(struct irq_domain *d, unsigned int irq,
irq_hw_number_t hw)
{
- if (irq == TIMER0_IRQ || irq == IPI_IRQ)
+ /*
+ * core intc IRQs [16, 23]:
+ * Statically assigned always private-per-core (Timers, WDT, IPI, PCT)
+ */
+ if (hw < 24) {
+ /*
+ * A subsequent request_percpu_irq() fails if percpu_devid is
+ * not set. Setting it in turn sets NOAUTOEN, meaning each core
+ * needs to call enable_percpu_irq()
+ */
+ irq_set_percpu_devid(irq);
irq_set_chip_and_handler(irq, &arcv2_irq_chip, handle_percpu_irq);
- else
+ } else {
irq_set_chip_and_handler(irq, &arcv2_irq_chip, handle_level_irq);
+ }
return 0;
}
#ifdef CONFIG_SMP
/* a SMP H/w block could do IPI IRQ request here */
- if (plat_smp_ops.init_irq_cpu)
- plat_smp_ops.init_irq_cpu(smp_processor_id());
+ if (plat_smp_ops.init_per_cpu)
+ plat_smp_ops.init_per_cpu(smp_processor_id());
- if (machine_desc->init_cpu_smp)
- machine_desc->init_cpu_smp(smp_processor_id());
+ if (machine_desc->init_per_cpu)
+ machine_desc->init_per_cpu(smp_processor_id());
#endif
}
set_irq_regs(old_regs);
}
+/*
+ * API called for requesting percpu interrupts - called by each CPU
+ * - For boot CPU, actually request the IRQ with genirq core + enables
+ * - For subsequent callers only enable called locally
+ *
+ * Relies on being called by the boot CPU first (i.e. request is called ahead
+ * of any enable), as expected by genirq. Hence suitable only for TIMER, IPI
+ * which are guaranteed to be set up on the boot core first.
+ * Late probed peripherals such as perf can't use this as there is no
+ * guarantee of being called on the boot CPU first.
+ */
+
void arc_request_percpu_irq(int irq, int cpu,
irqreturn_t (*isr)(int irq, void *dev),
const char *irq_nm,
if (!cpu) {
int rc;
+#ifdef CONFIG_ISA_ARCOMPACT
/*
- * These 2 calls are essential to making percpu IRQ APIs work
- * Ideally these details could be hidden in irq chip map function
- * but the issue is IPIs IRQs being static (non-DT) and platform
- * specific, so we can't identify them there.
+ * A subsequent request_percpu_irq() fails if percpu_devid is
+ * not set. Setting it in turn sets NOAUTOEN, meaning each core
+ * needs to call enable_percpu_irq()
+ *
+ * For ARCv2, this is done in irq map function since we know
+ * which irqs are strictly per cpu
*/
irq_set_percpu_devid(irq);
- irq_modify_status(irq, IRQ_NOAUTOEN, 0); /* @irq, @clr, @set */
+#endif
rc = request_percpu_irq(irq, isr, irq_nm, percpu_dev);
if (rc)
struct plat_smp_ops plat_smp_ops = {
.info = smp_cpuinfo_buf,
.init_early_smp = mcip_probe_n_setup,
- .init_irq_cpu = mcip_setup_per_cpu,
+ .init_per_cpu = mcip_setup_per_cpu,
.ipi_send = mcip_ipi_send,
.ipi_clear = mcip_ipi_clear,
};
#endif /* CONFIG_ISA_ARCV2 */
-void arc_cpu_pmu_irq_init(void)
+static void arc_cpu_pmu_irq_init(void *data)
{
- struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu);
+ int irq = *(int *)data;
- arc_request_percpu_irq(arc_pmu->irq, smp_processor_id(), arc_pmu_intr,
- "ARC perf counters", pmu_cpu);
+ enable_percpu_irq(irq, IRQ_TYPE_NONE);
/* Clear all pending interrupt flags */
write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff);
if (has_interrupts) {
int irq = platform_get_irq(pdev, 0);
- unsigned long flags;
if (irq < 0) {
pr_err("Cannot get IRQ number for the platform\n");
arc_pmu->irq = irq;
- /*
- * arc_cpu_pmu_irq_init() needs to be called on all cores for
- * their respective local PMU.
- * However we use opencoded on_each_cpu() to ensure it is called
- * on core0 first, so that arc_request_percpu_irq() sets up
- * AUTOEN etc. Otherwise enable_percpu_irq() fails to enable
- * perf IRQ on non master cores.
- * see arc_request_percpu_irq()
- */
- preempt_disable();
- local_irq_save(flags);
- arc_cpu_pmu_irq_init();
- local_irq_restore(flags);
- smp_call_function((smp_call_func_t)arc_cpu_pmu_irq_init, 0, 1);
- preempt_enable();
-
- /* Clean all pending interrupt flags */
- write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff);
+ /* intc map function ensures irq_set_percpu_devid() called */
+ request_percpu_irq(irq, arc_pmu_intr, "ARC perf counters",
+ this_cpu_ptr(&arc_pmu_cpu));
+
+ on_each_cpu(arc_cpu_pmu_irq_init, &irq, 1);
+
} else
arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
#endif
arc_unwind_init();
- arc_unwind_setup();
}
static int __init customize_machine(void)
pr_info("## CPU%u LIVE ##: Executing Code...\n", cpu);
/* Some SMP H/w setup - for each cpu */
- if (plat_smp_ops.init_irq_cpu)
- plat_smp_ops.init_irq_cpu(cpu);
+ if (plat_smp_ops.init_per_cpu)
+ plat_smp_ops.init_per_cpu(cpu);
- if (machine_desc->init_cpu_smp)
- machine_desc->init_cpu_smp(cpu);
+ if (machine_desc->init_per_cpu)
+ machine_desc->init_per_cpu(cpu);
arc_local_timer_setup();
static unsigned long read_pointer(const u8 **pLoc,
const void *end, signed ptrType);
+static void init_unwind_hdr(struct unwind_table *table,
+ void *(*alloc) (unsigned long));
+
+/*
+ * Wrappers for unwind header allocation, handed to init_unwind_hdr() as its
+ * @alloc callback (rather than picking one allocator or the other at the
+ * call site) to avoid section mismatch warnings.
+ *
+ * unw_hdr_alloc_early(): __init variant that requests @sz bytes from bootmem,
+ * aligned to sizeof(unsigned int); it is the allocator used for the root
+ * table.
+ * NOTE(review): presumably returns NULL rather than panicking on failure
+ * (the "_nopanic" bootmem variant) - confirm against the bootmem API.
+ */
+static void *__init unw_hdr_alloc_early(unsigned long sz)
+{
+ return __alloc_bootmem_nopanic(sz, sizeof(unsigned int),
+ MAX_DMA_ADDRESS);
+}
+
+/*
+ * Non-__init header allocator: returns @sz bytes from kmalloc(GFP_KERNEL).
+ * Passed to init_unwind_hdr() for tables added via unwind_add_table(); a
+ * header obtained this way is released with kfree() together with its table.
+ */
+static void *unw_hdr_alloc(unsigned long sz)
+{
+ return kmalloc(sz, GFP_KERNEL);
+}
static void init_unwind_table(struct unwind_table *table, const char *name,
const void *core_start, unsigned long core_size,
__start_unwind, __end_unwind - __start_unwind,
NULL, 0);
/*__start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr);*/
+
+ init_unwind_hdr(&root_table, unw_hdr_alloc_early);
}
static const u32 bad_cie, not_fde;
e2->fde = v;
}
-static void __init setup_unwind_table(struct unwind_table *table,
- void *(*alloc) (unsigned long))
+static void init_unwind_hdr(struct unwind_table *table,
+ void *(*alloc) (unsigned long))
{
const u8 *ptr;
unsigned long tableSize = table->size, hdrSize;
if (cie == ¬_fde)
continue;
if (cie == NULL || cie == &bad_cie)
- return;
+ goto ret_err;
ptrType = fde_pointer_type(cie);
if (ptrType < 0)
- return;
+ goto ret_err;
ptr = (const u8 *)(fde + 2);
if (!read_pointer(&ptr, (const u8 *)(fde + 1) + *fde,
}
if (tableSize || !n)
- return;
+ goto ret_err;
hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
+ 2 * n * sizeof(unsigned long);
+
header = alloc(hdrSize);
if (!header)
- return;
+ goto ret_err;
+
header->version = 1;
header->eh_frame_ptr_enc = DW_EH_PE_abs | DW_EH_PE_native;
header->fde_count_enc = DW_EH_PE_abs | DW_EH_PE_data4;
table->hdrsz = hdrSize;
smp_wmb();
table->header = (const void *)header;
-}
-
-static void *__init balloc(unsigned long sz)
-{
- return __alloc_bootmem_nopanic(sz,
- sizeof(unsigned int),
- __pa(MAX_DMA_ADDRESS));
-}
+ return;
-void __init arc_unwind_setup(void)
-{
- setup_unwind_table(&root_table, balloc);
+ret_err:
+ panic("Attention !!! Dwarf FDE parsing errors\n");;
}
#ifdef CONFIG_MODULES
table_start, table_size,
NULL, 0);
+ init_unwind_hdr(table, unw_hdr_alloc);
+
#ifdef UNWIND_DEBUG
unw_debug("Table added for [%s] %lx %lx\n",
module->name, table->core.pc, table->core.range);
info.init_only = init_only;
unlink_table(&info); /* XXX: SMP */
+ kfree(table->header);
kfree(table);
}
const u8 *ptr = (const u8 *)(cie + 2);
unsigned version = *ptr;
- if (version != 1)
- return -1; /* unsupported */
-
if (*++ptr) {
const char *aug;
const u8 *end = (const u8 *)(cie + 1) + *cie;
ptr = (const u8 *)(cie + 2);
end = (const u8 *)(cie + 1) + *cie;
frame->call_frame = 1;
- if ((state.version = *ptr) != 1)
- cie = NULL; /* unsupported version */
- else if (*++ptr) {
+ if (*++ptr) {
/* check if augmentation size is first (thus present) */
if (*ptr == 'z') {
while (++ptr < end && *ptr) {
}
EXPORT_SYMBOL(__kunmap_atomic);
-noinline pte_t *alloc_kmap_pgtable(unsigned long kvaddr)
+static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr)
{
pgd_t *pgd_k;
pud_t *pud_k;
return pte_k;
}
-void kmap_init(void)
+void __init kmap_init(void)
{
/* Due to recursive include hell, we can't do this in processor.h */
BUILD_BUG_ON(PAGE_OFFSET < (VMALLOC_END + FIXMAP_SIZE + PKMAP_SIZE));
int in_use = 0;
if (!low_mem_sz) {
- BUG_ON(base != low_mem_start);
+ if (base != low_mem_start)
+ panic("CONFIG_LINUX_LINK_BASE != DT memory { }");
+
low_mem_sz = size;
in_use = 1;
} else {
&fec {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_enet>;
- phy-mode = "rgmii";
+ phy-mode = "rgmii-id";
phy-reset-gpios = <&gpio1 30 GPIO_ACTIVE_HIGH>;
status = "okay";
};
&fec {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_enet>;
- phy-mode = "rgmii";
+ phy-mode = "rgmii-id";
phy-reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
status = "okay";
};
&fec {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_enet>;
- phy-mode = "rgmii";
+ phy-mode = "rgmii-id";
phy-reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
status = "okay";
};
&fec {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_enet>;
- phy-mode = "rgmii";
+ phy-mode = "rgmii-id";
phy-reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
status = "okay";
};
&fec {
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_enet>;
- phy-mode = "rgmii";
+ phy-mode = "rgmii-id";
phy-reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
status = "okay";
};
&clks {
assigned-clocks = <&clks IMX6QDL_PLL4_BYPASS_SRC>,
<&clks IMX6QDL_PLL4_BYPASS>,
- <&clks IMX6QDL_CLK_PLL4_POST_DIV>,
<&clks IMX6QDL_CLK_LDB_DI0_SEL>,
- <&clks IMX6QDL_CLK_LDB_DI1_SEL>;
+ <&clks IMX6QDL_CLK_LDB_DI1_SEL>,
+ <&clks IMX6QDL_CLK_PLL4_POST_DIV>;
assigned-clock-parents = <&clks IMX6QDL_CLK_LVDS2_IN>,
<&clks IMX6QDL_PLL4_BYPASS_SRC>,
<&clks IMX6QDL_CLK_PLL3_USB_OTG>,
<&clks IMX6QDL_CLK_PLL3_USB_OTG>;
- assigned-clock-rates = <0>, <0>, <24576000>;
+ assigned-clock-rates = <0>, <0>, <0>, <0>, <24576000>;
};
&ecspi1 {
};
};
+&uart3 {
+ interrupts-extended = <&wakeupgen GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH
+ &omap4_pmx_core OMAP4_UART3_RX>;
+};
cache-sets = <512>;
cache-line-size = <32>;
/* At full speed latency must be >=2 */
- arm,tag-latency = <2>;
- arm,data-latency = <2 2>;
- arm,dirty-latency = <2>;
+ arm,tag-latency = <8>;
+ arm,data-latency = <8 8>;
+ arm,dirty-latency = <8>;
};
mtu0: mtu@101e2000 {
reg = <0x5d>;
interrupt-parent = <&pio>;
interrupts = <0 3 IRQ_TYPE_LEVEL_HIGH>; /* PA3 */
+ touchscreen-swapped-x-y;
};
};
/* CPU DFLL clock */
clock@0,70110000 {
- status = "okay";
+ status = "disabled";
vdd-cpu-supply = <&vdd_cpu>;
nvidia,i2c-fs-rate = <400000>;
};
interrupt-parent = <&vic>;
interrupts = <31>; /* Cascaded to vic */
clear-mask = <0xffffffff>;
- valid-mask = <0xffc203f8>;
+ /*
+ * Valid interrupt lines mask according to
+ * table 4-36 page 4-50 of ARM DUI 0225D
+ */
+ valid-mask = <0x0760031b>;
};
dma@10130000 {
};
mmc@5000 {
compatible = "arm,pl180", "arm,primecell";
- reg = < 0x5000 0x1000>;
- interrupts-extended = <&vic 22 &sic 2>;
+ reg = <0x5000 0x1000>;
+ interrupts-extended = <&vic 22 &sic 1>;
clocks = <&xtal24mhz>, <&pclk>;
clock-names = "mclk", "apb_pclk";
};
compatible = "arm,versatile-pb";
amba {
+ /* The Versatile PB is using more SIC IRQ lines than the AB */
+ sic: intc@10003000 {
+ clear-mask = <0xffffffff>;
+ /*
+ * Valid interrupt lines mask according to
+ * figure 3-30 page 3-74 of ARM DUI 0224B
+ */
+ valid-mask = <0x7fe003ff>;
+ };
+
gpio2: gpio@101e6000 {
compatible = "arm,pl061", "arm,primecell";
reg = <0x101e6000 0x1000>;
};
fpga {
+ mmc@5000 {
+ /*
+ * Overrides the interrupt assignment from
+ * the Versatile AB board file.
+ */
+ interrupts-extended = <&sic 22 &sic 23>;
+ };
uart@9000 {
compatible = "arm,pl011", "arm,primecell";
reg = <0x9000 0x1000>;
mmc@b000 {
compatible = "arm,pl180", "arm,primecell";
reg = <0xb000 0x1000>;
- interrupts-extended = <&vic 23 &sic 2>;
+ interrupt-parent = <&sic>;
+ interrupts = <1>, <2>;
clocks = <&xtal24mhz>, <&pclk>;
clock-names = "mclk", "apb_pclk";
};
interrupts = <43>;
};
+ sdhc@d800a000 {
+ compatible = "wm,wm8505-sdhc";
+ reg = <0xd800a000 0x400>;
+ interrupts = <20>, <21>;
+ clocks = <&clksdhc>;
+ bus-width = <4>;
+ sdon-inverted;
+ };
+
fb: fb@d8050800 {
compatible = "wm,wm8505-fb";
reg = <0xd8050800 0x200>;
CONFIG_CHARGER_MAX14577=m
CONFIG_CHARGER_MAX77693=m
CONFIG_CHARGER_TPS65090=y
+CONFIG_AXP20X_POWER=m
CONFIG_POWER_RESET_AS3722=y
CONFIG_POWER_RESET_GPIO=y
CONFIG_POWER_RESET_GPIO_RESTART=y
CONFIG_SPI_SUN6I=y
CONFIG_GPIO_SYSFS=y
CONFIG_POWER_SUPPLY=y
+CONFIG_AXP20X_POWER=y
CONFIG_THERMAL=y
CONFIG_CPU_THERMAL=y
CONFIG_WATCHDOG=y
static inline unsigned long __must_check
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
+#ifndef CONFIG_UACCESS_WITH_MEMCPY
unsigned int __ua_flags = uaccess_save_and_enable();
n = arm_copy_to_user(to, from, n);
uaccess_restore(__ua_flags);
return n;
+#else
+ return arm_copy_to_user(to, from, n);
+#endif
}
extern unsigned long __must_check
{
unsigned long flags;
char buf[64];
+#ifndef CONFIG_CPU_V7M
+ unsigned int domain;
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ /*
+ * Get the domain register for the parent context. In user
+ * mode, we don't save the DACR, so let's use what it should
+ * be. For other modes, we place it after the pt_regs struct.
+ */
+ if (user_mode(regs))
+ domain = DACR_UACCESS_ENABLE;
+ else
+ domain = *(unsigned int *)(regs + 1);
+#else
+ domain = get_domain();
+#endif
+#endif
show_regs_print_info(KERN_DEFAULT);
#ifndef CONFIG_CPU_V7M
{
- unsigned int domain = get_domain();
const char *segment;
-#ifdef CONFIG_CPU_SW_DOMAIN_PAN
- /*
- * Get the domain register for the parent context. In user
- * mode, we don't save the DACR, so lets use what it should
- * be. For other modes, we place it after the pt_regs struct.
- */
- if (user_mode(regs))
- domain = DACR_UACCESS_ENABLE;
- else
- domain = *(unsigned int *)(regs + 1);
-#endif
-
if ((domain & domain_mask(DOMAIN_USER)) ==
domain_val(DOMAIN_USER, DOMAIN_NOACCESS))
segment = "none";
buf[0] = '\0';
#ifdef CONFIG_CPU_CP15_MMU
{
- unsigned int transbase, dac = get_domain();
+ unsigned int transbase;
asm("mrc p15, 0, %0, c2, c0\n\t"
: "=r" (transbase));
snprintf(buf, sizeof(buf), " Table: %08x DAC: %08x",
- transbase, dac);
+ transbase, domain);
}
#endif
asm("mrc p15, 0, %0, c1, c0\n" : "=r" (ctrl));
*/
#define __user_swpX_asm(data, addr, res, temp, B) \
__asm__ __volatile__( \
- " mov %2, %1\n" \
- "0: ldrex"B" %1, [%3]\n" \
- "1: strex"B" %0, %2, [%3]\n" \
+ "0: ldrex"B" %2, [%3]\n" \
+ "1: strex"B" %0, %1, [%3]\n" \
" cmp %0, #0\n" \
+ " moveq %1, %2\n" \
" movne %0, %4\n" \
"2:\n" \
" .section .text.fixup,\"ax\"\n" \
pid_t l_pid;
} __attribute__ ((packed,aligned(4)));
-asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
+/*
+ * Handle the flock64-based fcntl commands for OABI callers.
+ *
+ * Copies the struct oabi_flock64 at user address @arg into a native
+ * struct flock64 field by field, runs sys_fcntl64() on that kernel copy with
+ * the address limit temporarily switched to KERNEL_DS, and restores the
+ * previous limit afterwards. For F_GETLK64 and F_OFD_GETLK a successful
+ * result is converted back and copied out to @arg.
+ *
+ * Returns the sys_fcntl64() result, or -EFAULT if either user copy fails.
+ */
+static long do_locks(unsigned int fd, unsigned int cmd,
unsigned long arg)
{
- struct oabi_flock64 user;
struct flock64 kernel;
- mm_segment_t fs = USER_DS; /* initialized to kill a warning */
- unsigned long local_arg = arg;
- int ret;
+ struct oabi_flock64 user;
+ mm_segment_t fs;
+ long ret;
+
+ if (copy_from_user(&user, (struct oabi_flock64 __user *)arg,
+ sizeof(user)))
+ return -EFAULT;
+ kernel.l_type = user.l_type;
+ kernel.l_whence = user.l_whence;
+ kernel.l_start = user.l_start;
+ kernel.l_len = user.l_len;
+ kernel.l_pid = user.l_pid;
+
+ /* sys_fcntl64() is handed the on-stack kernel copy, not @arg */
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_fcntl64(fd, cmd, (unsigned long)&kernel);
+ set_fs(fs);
+
+ /* only the "get" commands return lock data to the caller */
+ if (!ret && (cmd == F_GETLK64 || cmd == F_OFD_GETLK)) {
+ user.l_type = kernel.l_type;
+ user.l_whence = kernel.l_whence;
+ user.l_start = kernel.l_start;
+ user.l_len = kernel.l_len;
+ user.l_pid = kernel.l_pid;
+ if (copy_to_user((struct oabi_flock64 __user *)arg,
+ &user, sizeof(user)))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
+ unsigned long arg)
+{
switch (cmd) {
case F_OFD_GETLK:
case F_OFD_SETLK:
case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
- if (copy_from_user(&user, (struct oabi_flock64 __user *)arg,
- sizeof(user)))
- return -EFAULT;
- kernel.l_type = user.l_type;
- kernel.l_whence = user.l_whence;
- kernel.l_start = user.l_start;
- kernel.l_len = user.l_len;
- kernel.l_pid = user.l_pid;
- local_arg = (unsigned long)&kernel;
- fs = get_fs();
- set_fs(KERNEL_DS);
- }
-
- ret = sys_fcntl64(fd, cmd, local_arg);
+ return do_locks(fd, cmd, arg);
- switch (cmd) {
- case F_GETLK64:
- if (!ret) {
- user.l_type = kernel.l_type;
- user.l_whence = kernel.l_whence;
- user.l_start = kernel.l_start;
- user.l_len = kernel.l_len;
- user.l_pid = kernel.l_pid;
- if (copy_to_user((struct oabi_flock64 __user *)arg,
- &user, sizeof(user)))
- ret = -EFAULT;
- }
- case F_SETLK64:
- case F_SETLKW64:
- set_fs(fs);
+ default:
+ return sys_fcntl64(fd, cmd, arg);
}
-
- return ret;
}
struct oabi_epoll_event {
static unsigned long noinline
__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
{
+ unsigned long ua_flags;
int atomic;
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
if (tocopy > n)
tocopy = n;
+ ua_flags = uaccess_save_and_enable();
memcpy((void *)to, from, tocopy);
+ uaccess_restore(ua_flags);
to += tocopy;
from += tocopy;
n -= tocopy;
* With frame pointer disabled, tail call optimization kicks in
* as well making this test almost invisible.
*/
- if (n < 64)
- return __copy_to_user_std(to, from, n);
- return __copy_to_user_memcpy(to, from, n);
+ if (n < 64) {
+ unsigned long ua_flags = uaccess_save_and_enable();
+ n = __copy_to_user_std(to, from, n);
+ uaccess_restore(ua_flags);
+ } else {
+ n = __copy_to_user_memcpy(to, from, n);
+ }
+ return n;
}
static unsigned long noinline
__clear_user_memset(void __user *addr, unsigned long n)
{
+ unsigned long ua_flags;
+
if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
memset((void *)addr, 0, n);
return 0;
if (tocopy > n)
tocopy = n;
+ ua_flags = uaccess_save_and_enable();
memset((void *)addr, 0, tocopy);
+ uaccess_restore(ua_flags);
addr += tocopy;
n -= tocopy;
unsigned long arm_clear_user(void __user *addr, unsigned long n)
{
/* See rational for this in __copy_to_user() above. */
- if (n < 64)
- return __clear_user_std(addr, n);
- return __clear_user_memset(addr, n);
+ if (n < 64) {
+ unsigned long ua_flags = uaccess_save_and_enable();
+ n = __clear_user_std(addr, n);
+ uaccess_restore(ua_flags);
+ } else {
+ n = __clear_user_memset(addr, n);
+ }
+ return n;
}
#if 0
select MACH_OMAP_GENERIC
select MIGHT_HAVE_CACHE_L2X0
select HAVE_ARM_SCU
+ select GENERIC_CLOCKEVENTS_BROADCAST
+ select HAVE_ARM_TWD
config SOC_DRA7XX
bool "TI DRA7XX"
freq = 104;
break;
default:
- freq = 54;
- break;
+ pr_err("onenand rate not detected, bad GPMC async timings?\n");
+ freq = 0;
}
return freq;
struct gpmc_timings t;
int ret;
+ /*
+ * Note that we need to keep sync_write set for the call to
+ * omap2_onenand_set_async_mode() to work to detect the onenand
+ * supported clock rate for the sync timings.
+ */
if (gpmc_onenand_data->of_node) {
gpmc_read_settings_dt(gpmc_onenand_data->of_node,
&onenand_async);
else
gpmc_onenand_data->flags |= ONENAND_SYNC_READ;
onenand_async.sync_read = false;
- onenand_async.sync_write = false;
}
}
- omap2_onenand_set_async_mode(onenand_base);
-
omap2_onenand_calc_async_timings(&t);
ret = gpmc_cs_program_settings(gpmc_onenand_data->cs, &onenand_async);
if (!freq) {
/* Very first call freq is not known */
freq = omap2_onenand_get_freq(gpmc_onenand_data, onenand_base);
+ if (!freq)
+ return -ENODEV;
set_onenand_cfg(onenand_base);
}
return r;
}
+#if !defined(CONFIG_SMP) && defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)
+/*
+ * Empty tick_broadcast() for builds without CONFIG_SMP that still have
+ * CONFIG_GENERIC_CLOCKEVENTS_BROADCAST enabled: @mask is ignored and nothing
+ * is sent.
+ * NOTE(review): presumably present so the clockevents broadcast code still
+ * links when no SMP implementation provides this symbol - confirm.
+ */
+void tick_broadcast(const struct cpumask *mask)
+{
+}
+#endif
+
static void __init omap2_gp_clockevent_init(int gptimer_id,
const char *fck_source,
const char *property)
__flush_icache_all();
}
-static int is_reserved_asid(u64 asid)
+/*
+ * Scan every possible CPU's reserved_asids slot for @asid and overwrite each
+ * matching slot with @newasid.
+ *
+ * Returns true if at least one slot matched, false otherwise.
+ */
+static bool check_update_reserved_asid(u64 asid, u64 newasid)
{
int cpu;
- for_each_possible_cpu(cpu)
- if (per_cpu(reserved_asids, cpu) == asid)
- return 1;
- return 0;
+ bool hit = false;
+
+ /*
+ * Iterate over the set of reserved ASIDs looking for a match.
+ * If we find one, then we can update our mm to use newasid
+ * (i.e. the same ASID in the current generation) but we can't
+ * exit the loop early, since we need to ensure that all copies
+ * of the old ASID are updated to reflect the mm. Failure to do
+ * so could result in us missing the reserved ASID in a future
+ * generation.
+ */
+ for_each_possible_cpu(cpu) {
+ if (per_cpu(reserved_asids, cpu) == asid) {
+ hit = true;
+ per_cpu(reserved_asids, cpu) = newasid;
+ }
+ }
+
+ return hit;
}
static u64 new_context(struct mm_struct *mm, unsigned int cpu)
u64 generation = atomic64_read(&asid_generation);
if (asid != 0) {
+ u64 newasid = generation | (asid & ~ASID_MASK);
+
/*
* If our current ASID was active during a rollover, we
* can continue to use it and this was just a false alarm.
*/
- if (is_reserved_asid(asid))
- return generation | (asid & ~ASID_MASK);
+ if (check_update_reserved_asid(asid, newasid))
+ return newasid;
/*
* We had a valid ASID in a previous life, so try to re-use
*/
asid &= ~ASID_MASK;
if (!__test_and_set_bit(asid, asid_map))
- goto bump_gen;
+ return newasid;
}
/*
__set_bit(asid, asid_map);
cur_idx = asid;
-
-bump_gen:
- asid |= generation;
cpumask_clear(mm_cpumask(mm));
- return asid;
+ return asid | generation;
}
void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
return -ENOMEM;
for (count = 0, s = sg; count < (size >> PAGE_SHIFT); s = sg_next(s)) {
- phys_addr_t phys = sg_phys(s) & PAGE_MASK;
+ phys_addr_t phys = page_to_phys(sg_page(s));
unsigned int len = PAGE_ALIGN(s->offset + s->length);
if (!is_coherent &&
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/sizes.h>
+#include <linux/stop_machine.h>
#include <asm/cp15.h>
#include <asm/mach-types.h>
* safe to be called with preemption disabled, as under stop_machine().
*/
static inline void section_update(unsigned long addr, pmdval_t mask,
- pmdval_t prot)
+ pmdval_t prot, struct mm_struct *mm)
{
- struct mm_struct *mm;
pmd_t *pmd;
- mm = current->active_mm;
pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr);
#ifdef CONFIG_ARM_LPAE
return !!(get_cr() & CR_XP);
}
-#define set_section_perms(perms, field) { \
- size_t i; \
- unsigned long addr; \
- \
- if (!arch_has_strict_perms()) \
- return; \
- \
- for (i = 0; i < ARRAY_SIZE(perms); i++) { \
- if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) || \
- !IS_ALIGNED(perms[i].end, SECTION_SIZE)) { \
- pr_err("BUG: section %lx-%lx not aligned to %lx\n", \
- perms[i].start, perms[i].end, \
- SECTION_SIZE); \
- continue; \
- } \
- \
- for (addr = perms[i].start; \
- addr < perms[i].end; \
- addr += SECTION_SIZE) \
- section_update(addr, perms[i].mask, \
- perms[i].field); \
- } \
+/*
+ * Apply the @n section permission entries in @perms to the page tables
+ * of @mm.
+ *
+ * Does nothing unless arch_has_strict_perms(). An entry whose start or end is
+ * not SECTION_SIZE aligned is reported with pr_err() and skipped; otherwise
+ * section_update() is called for every SECTION_SIZE step in [start, end),
+ * using the entry's ->prot value when @set is true and ->clear when false.
+ */
+void set_section_perms(struct section_perm *perms, int n, bool set,
+ struct mm_struct *mm)
+{
+ size_t i;
+ unsigned long addr;
+
+ if (!arch_has_strict_perms())
+ return;
+
+ for (i = 0; i < n; i++) {
+ if (!IS_ALIGNED(perms[i].start, SECTION_SIZE) ||
+ !IS_ALIGNED(perms[i].end, SECTION_SIZE)) {
+ pr_err("BUG: section %lx-%lx not aligned to %lx\n",
+ perms[i].start, perms[i].end,
+ SECTION_SIZE);
+ continue;
+ }
+
+ for (addr = perms[i].start;
+ addr < perms[i].end;
+ addr += SECTION_SIZE)
+ section_update(addr, perms[i].mask,
+ set ? perms[i].prot : perms[i].clear, mm);
+ }
+
}
-static inline void fix_kernmem_perms(void)
+/*
+ * Apply @perms (@n entries, "set" direction) to the page tables of every
+ * user thread's mm, then to current->active_mm and to init_mm.
+ *
+ * The task list is walked under read_lock(&tasklist_lock); kernel threads
+ * (PF_KTHREAD) are skipped, as are threads that no longer have an mm.
+ */
+static void update_sections_early(struct section_perm perms[], int n)
{
- set_section_perms(nx_perms, prot);
+ struct task_struct *t, *s;
+
+ read_lock(&tasklist_lock);
+ for_each_process(t) {
+ if (t->flags & PF_KTHREAD)
+ continue;
+ for_each_thread(t, s) {
+ /*
+ * An exiting thread has already dropped its mm
+ * (->mm == NULL); there is nothing to update and
+ * set_section_perms() would dereference the pointer.
+ */
+ if (s->mm)
+ set_section_perms(perms, n, true, s->mm);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ set_section_perms(perms, n, true, current->active_mm);
+ set_section_perms(perms, n, true, &init_mm);
+}
+
+/*
+ * stop_machine() callback used by fix_kernmem_perms(): applies nx_perms
+ * through update_sections_early(). @unused is ignored; always returns 0.
+ */
+int __fix_kernmem_perms(void *unused)
+{
+ update_sections_early(nx_perms, ARRAY_SIZE(nx_perms));
+ return 0;
+}
+
+/*
+ * Apply the nx_perms section permissions by running __fix_kernmem_perms()
+ * under stop_machine(), with no callback data and a NULL cpumask.
+ */
+void fix_kernmem_perms(void)
+{
+ stop_machine(__fix_kernmem_perms, NULL, NULL);
}
#ifdef CONFIG_DEBUG_RODATA
+/*
+ * stop_machine() callback used by mark_rodata_ro(): applies ro_perms through
+ * update_sections_early(). @unused is ignored; always returns 0.
+ */
+int __mark_rodata_ro(void *unused)
+{
+ update_sections_early(ro_perms, ARRAY_SIZE(ro_perms));
+ return 0;
+}
+
void mark_rodata_ro(void)
{
- set_section_perms(ro_perms, prot);
+ stop_machine(__mark_rodata_ro, NULL, NULL);
}
void set_kernel_text_rw(void)
{
- set_section_perms(ro_perms, clear);
+ set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false,
+ current->active_mm);
}
void set_kernel_text_ro(void)
{
- set_section_perms(ro_perms, prot);
+ set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true,
+ current->active_mm);
}
#endif /* CONFIG_DEBUG_RODATA */
.equ cpu_v7_suspend_size, 4 * 9
#ifdef CONFIG_ARM_CPU_SUSPEND
ENTRY(cpu_v7_do_suspend)
- stmfd sp!, {r4 - r10, lr}
+ stmfd sp!, {r4 - r11, lr}
mrc p15, 0, r4, c13, c0, 0 @ FCSE/PID
mrc p15, 0, r5, c13, c0, 3 @ User r/o thread ID
stmia r0!, {r4 - r5}
mrc p15, 0, r9, c1, c0, 1 @ Auxiliary control register
mrc p15, 0, r10, c1, c0, 2 @ Co-processor access control
stmia r0, {r5 - r11}
- ldmfd sp!, {r4 - r10, pc}
+ ldmfd sp!, {r4 - r11, pc}
ENDPROC(cpu_v7_do_suspend)
ENTRY(cpu_v7_do_resume)
return fls(ctx->seen & SEEN_MEM);
}
-static inline bool is_load_to_a(u16 inst)
-{
- switch (inst) {
- case BPF_LD | BPF_W | BPF_LEN:
- case BPF_LD | BPF_W | BPF_ABS:
- case BPF_LD | BPF_H | BPF_ABS:
- case BPF_LD | BPF_B | BPF_ABS:
- return true;
- default:
- return false;
- }
-}
-
static void jit_fill_hole(void *area, unsigned int size)
{
u32 *ptr;
static void build_prologue(struct jit_ctx *ctx)
{
u16 reg_set = saved_regs(ctx);
- u16 first_inst = ctx->skf->insns[0].code;
u16 off;
#ifdef CONFIG_FRAME_POINTER
emit(ARM_MOV_I(r_X, 0), ctx);
/* do not leak kernel data to userspace */
- if ((first_inst != (BPF_RET | BPF_K)) && !(is_load_to_a(first_inst)))
+ if (bpf_needs_clear_a(&ctx->skf->insns[0]))
emit(ARM_MOV_I(r_A, 0), ctx);
/* stack space for the BPF_MEM words */
case BPF_ALU | BPF_RSH | BPF_K:
if (unlikely(k > 31))
return -1;
- emit(ARM_LSR_I(r_A, r_A, k), ctx);
+ if (k)
+ emit(ARM_LSR_I(r_A, r_A, k), ctx);
break;
case BPF_ALU | BPF_RSH | BPF_X:
update_on_xread(ctx);
#endif /* !CONFIG_SMP */
#define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
-#define tas(ptr) ((void)xchg((ptr), 1))
#endif /* __ARCH_BLACKFIN_CMPXCHG__ */
#define xchg(ptr, x) \
((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \
sizeof(*(ptr))))
-#define tas(ptr) xchg((ptr), 1)
-
#include <asm-generic/cmpxchg-local.h>
#endif
-#define tas(ptr) (xchg((ptr), 1))
-
/*****************************************************************************/
/*
* compare and conditionally exchange value with memory
___p1; \
})
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
/*
* The group barrier in front of the rsm & ssm are necessary to ensure
-#define NR_syscalls 322 /* length of syscall table */
+#define NR_syscalls 323 /* length of syscall table */
/*
* The following defines stop scripts/checksyscalls.sh from complaining about
#define __NR_userfaultfd 1343
#define __NR_membarrier 1344
#define __NR_kcmp 1345
+#define __NR_mlock2 1346
#endif /* _UAPI_ASM_IA64_UNISTD_H */
data8 sys_userfaultfd
data8 sys_membarrier
data8 sys_kcmp // 1345
+ data8 sys_mlock2
.org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
generic-y += cputime.h
generic-y += exec.h
generic-y += irq_work.h
+generic-y += kvm_para.h
generic-y += mcs_spinlock.h
generic-y += mm-arch-hooks.h
generic-y += module.h
#define writew_relaxed writew
#define writel_relaxed writel
-#define ioread8 read
+#define ioread8 readb
#define ioread16 readw
#define ioread32 readl
#define iowrite8 writeb
#define iowrite16 writew
#define iowrite32 writel
+#define ioread8_rep(p, dst, count) insb((unsigned long)(p), (dst), (count))
+#define ioread16_rep(p, dst, count) insw((unsigned long)(p), (dst), (count))
+#define ioread32_rep(p, dst, count) insl((unsigned long)(p), (dst), (count))
+
+#define iowrite8_rep(p, src, count) outsb((unsigned long)(p), (src), (count))
+#define iowrite16_rep(p, src, count) outsw((unsigned long)(p), (src), (count))
+#define iowrite32_rep(p, src, count) outsl((unsigned long)(p), (src), (count))
+
#define ioread16be(addr) be16_to_cpu(readw(addr))
#define ioread32be(addr) be32_to_cpu(readl(addr))
#define iowrite16be(v, addr) writew(cpu_to_be16(v), (addr))
/* FIXME this part of code is untested */
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = sg_phys(sg);
- __dma_sync(sg_phys(sg), sg->length, direction);
+ __dma_sync(page_to_phys(sg_page(sg)) + sg->offset,
+ sg->length, direction);
}
return nents;
* On error, the variable @x is set to zero.
*/
#define __get_user_unaligned(x,ptr) \
- __get_user__unalignednocheck((x),(ptr),sizeof(*(ptr)))
+ __get_user_unaligned_nocheck((x),(ptr),sizeof(*(ptr)))
/*
* Yuck. We need two variants, one for 64bit operation and one
do { \
switch (size) { \
case 1: __get_data_asm(val, "lb", ptr); break; \
- case 2: __get_user_unaligned_asm(val, "ulh", ptr); break; \
- case 4: __get_user_unaligned_asm(val, "ulw", ptr); break; \
+ case 2: __get_data_unaligned_asm(val, "ulh", ptr); break; \
+ case 4: __get_data_unaligned_asm(val, "ulw", ptr); break; \
case 8: __GET_USER_UNALIGNED_DW(val, ptr); break; \
default: __get_user_unaligned_unknown(); break; \
} \
__cu_to = (to); \
__cu_from = (from); \
__cu_len = (n); \
- might_fault(); \
- __cu_len = __invoke_copy_from_user(__cu_to, __cu_from, \
- __cu_len); \
+ if (eva_kernel_access()) { \
+ __cu_len = __invoke_copy_from_kernel(__cu_to, \
+ __cu_from, \
+ __cu_len); \
+ } else { \
+ might_fault(); \
+ __cu_len = __invoke_copy_from_user(__cu_to, __cu_from, \
+ __cu_len); \
+ } \
__cu_len; \
})
{
__kernel_size_t res;
- might_fault();
- __asm__ __volatile__(
- "move\t$4, %1\n\t"
- "move\t$5, $0\n\t"
- "move\t$6, %2\n\t"
- __MODULE_JAL(__bzero)
- "move\t%0, $6"
- : "=r" (res)
- : "r" (addr), "r" (size)
- : "$4", "$5", "$6", __UA_t0, __UA_t1, "$31");
+ if (eva_kernel_access()) {
+ __asm__ __volatile__(
+ "move\t$4, %1\n\t"
+ "move\t$5, $0\n\t"
+ "move\t$6, %2\n\t"
+ __MODULE_JAL(__bzero_kernel)
+ "move\t%0, $6"
+ : "=r" (res)
+ : "r" (addr), "r" (size)
+ : "$4", "$5", "$6", __UA_t0, __UA_t1, "$31");
+ } else {
+ might_fault();
+ __asm__ __volatile__(
+ "move\t$4, %1\n\t"
+ "move\t$5, $0\n\t"
+ "move\t$6, %2\n\t"
+ __MODULE_JAL(__bzero)
+ "move\t%0, $6"
+ : "=r" (res)
+ : "r" (addr), "r" (size)
+ : "$4", "$5", "$6", __UA_t0, __UA_t1, "$31");
+ }
return res;
}
might_fault();
__asm__ __volatile__(
"move\t$4, %1\n\t"
- __MODULE_JAL(__strlen_kernel_asm)
+ __MODULE_JAL(__strlen_user_asm)
"move\t%0, $2"
: "=r" (res)
: "r" (s)
has_mt t0, 3f
.set push
- .set mips64r2
.set mt
/* Only allow 1 TC per VPE to execute... */
nop
.set push
- .set mips64r2
.set mt
1: /* Enter VPE configuration state */
#include <asm/fpu.h>
#include <asm/msa.h>
+extern void *__bzero_kernel(void *__s, size_t __count);
extern void *__bzero(void *__s, size_t __count);
extern long __strncpy_from_kernel_nocheck_asm(char *__to,
const char *__from, long __len);
EXPORT_SYMBOL(__copy_in_user_eva);
EXPORT_SYMBOL(__copy_to_user_eva);
EXPORT_SYMBOL(__copy_user_inatomic_eva);
+EXPORT_SYMBOL(__bzero_kernel);
#endif
EXPORT_SYMBOL(__bzero);
EXPORT_SYMBOL(__strncpy_from_kernel_nocheck_asm);
1:
#ifndef CONFIG_EVA
FEXPORT(__bzero)
+#else
+FEXPORT(__bzero_kernel)
#endif
__BUILD_BZERO LEGACY_MODE
return num;
}
-static bool is_load_to_a(u16 inst)
-{
- switch (inst) {
- case BPF_LD | BPF_W | BPF_LEN:
- case BPF_LD | BPF_W | BPF_ABS:
- case BPF_LD | BPF_H | BPF_ABS:
- case BPF_LD | BPF_B | BPF_ABS:
- return true;
- default:
- return false;
- }
-}
-
static void save_bpf_jit_regs(struct jit_ctx *ctx, unsigned offset)
{
int i = 0, real_off = 0;
static void build_prologue(struct jit_ctx *ctx)
{
- u16 first_inst = ctx->skf->insns[0].code;
int sp_off;
/* Calculate the total offset for the stack pointer */
emit_jit_reg_move(r_X, r_zero, ctx);
/* Do not leak kernel data to userspace */
- if ((first_inst != (BPF_RET | BPF_K)) && !(is_load_to_a(first_inst)))
+ if (bpf_needs_clear_a(&ctx->skf->insns[0]))
emit_jit_reg_move(r_A, r_zero, ctx);
}
static int rt288x_pci_probe(struct platform_device *pdev)
{
void __iomem *io_map_base;
- int i;
rt2880_pci_base = ioremap_nocache(RT2880_PCI_BASE, PAGE_SIZE);
void msp7120_reset(void)
{
void *start, *end, *iptr;
- register int i;
/* Diasble all interrupts */
local_irq_disable();
/* XXX This ends up at the ARC firmware prompt ... */
void sni_machine_restart(char *command)
{
- int i, j;
+ int i;
/* This does a normal via the keyboard controller like a PC.
We can do that easier ... */
# the comments on that file.
#
ifndef CONFIG_CPU_MIPSR6
- ifeq ($(call ld-ifversion, -gt, 22400000, y),)
- $(warning MIPS VDSO requires binutils > 2.24)
+ ifeq ($(call ld-ifversion, -lt, 22500000, y),y)
+ $(warning MIPS VDSO requires binutils >= 2.25)
obj-vdso-y := $(filter-out gettimeofday.o, $(obj-vdso-y))
ccflags-vdso += -DDISABLE_MIPS_VDSO
endif
regs->gr[28]);
}
+/*
+ * Rewind the userspace return pointer so that an interrupted syscall is
+ * restarted, then check how the syscall number gets loaded into %r20 within
+ * the delay slot of the userspace branch and adjust registers as needed.
+ *
+ * @regs: saved user register state. gr[31] (the return pointer) is always
+ *        moved back by 8. When the delay slot holds "copy %rX,%r20",
+ *        gr[X] is additionally overwritten with gr[20].
+ *
+ * Returns nothing. If the delay-slot opcode cannot be read from userspace,
+ * only gr[31] has been changed. An unrecognised opcode is reported with
+ * pr_warn() and otherwise ignored.
+ */
+
+static void check_syscallno_in_delay_branch(struct pt_regs *regs)
+{
+ u32 opcode, source_reg;
+ u32 __user *uaddr;
+ int err;
+
+ /* Usually we don't have to restore %r20 (the system call number)
+ * because it gets loaded in the delay slot of the branch external
+ * instruction via the ldi instruction.
+ * In some cases a register-to-register copy instruction might have
+ * been used instead, in which case we need to copy the syscall
+ * number into the source register before returning to userspace.
+ */
+
+ /* A syscall is just a branch, so all we have to do is fiddle the
+ * return pointer so that the ble instruction gets executed again.
+ */
+ regs->gr[31] -= 8; /* delayed branching */
+
+ /* Get assembler opcode of code in delay branch.
+ * The rewound gr[31], with its two low bits cleared, addresses the
+ * branch itself; the delay slot is the next word (+4).
+ * NOTE(review): the two low bits are presumably not address bits
+ * (privilege level?) - confirm against the architecture manual.
+ * The cast keeps the __user annotation so that it matches uaddr and
+ * get_user() for sparse address-space checking.
+ */
+ uaddr = (u32 __user *) ((regs->gr[31] & ~3) + 4);
+ err = get_user(opcode, uaddr);
+ if (err)
+ return; /* opcode unreadable: restart without further fixups */
+
+ /* Check if delay branch uses "ldi int,%r20" */
+ if ((opcode & 0xffff0000) == 0x34140000)
+ return; /* everything ok, just return */
+
+ /* Check if delay branch uses "nop" */
+ if (opcode == INSN_NOP)
+ return;
+
+ /* Check if delay branch uses "copy %rX,%r20" */
+ if ((opcode & 0xffe0ffff) == 0x08000254) {
+ /* The mask above ignores bits 16..20; they hold the 5-bit
+ * source register number X.
+ */
+ source_reg = (opcode >> 16) & 31;
+ /* The copy gets re-executed on restart, so %rX must hold
+ * the syscall number again.
+ */
+ regs->gr[source_reg] = regs->gr[20];
+ return;
+ }
+
+ pr_warn("syscall restart: %s (pid %d): unexpected opcode 0x%08x\n",
+ current->comm, task_pid_nr(current), opcode);
+}
+
static inline void
syscall_restart(struct pt_regs *regs, struct k_sigaction *ka)
{
}
/* fallthrough */
case -ERESTARTNOINTR:
- /* A syscall is just a branch, so all
- * we have to do is fiddle the return pointer.
- */
- regs->gr[31] -= 8; /* delayed branching */
+ check_syscallno_in_delay_branch(regs);
break;
}
}
}
case -ERESTARTNOHAND:
case -ERESTARTSYS:
- case -ERESTARTNOINTR: {
- /* Hooray for delayed branching. We don't
- * have to restore %r20 (the system call
- * number) because it gets loaded in the delay
- * slot of the branch external instruction.
- */
- regs->gr[31] -= 8;
+ case -ERESTARTNOINTR:
+ check_syscallno_in_delay_branch(regs);
return;
- }
default:
break;
}
#define rmb() __asm__ __volatile__ ("sync" : : : "memory")
#define wmb() __asm__ __volatile__ ("sync" : : : "memory")
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
#ifdef __SUBARCH_HAS_LWSYNC
# define SMPWMB LWSYNC
PPC64ONLY(switch_endian)
SYSCALL_SPU(userfaultfd)
SYSCALL_SPU(membarrier)
-SYSCALL(semop)
-SYSCALL(semget)
-COMPAT_SYS(semctl)
-COMPAT_SYS(semtimedop)
-COMPAT_SYS(msgsnd)
-COMPAT_SYS(msgrcv)
-SYSCALL(msgget)
-COMPAT_SYS(msgctl)
-COMPAT_SYS(shmat)
-SYSCALL(shmdt)
-SYSCALL(shmget)
-COMPAT_SYS(shmctl)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
SYSCALL(mlock2)
#define __NR_switch_endian 363
#define __NR_userfaultfd 364
#define __NR_membarrier 365
-#define __NR_semop 366
-#define __NR_semget 367
-#define __NR_semctl 368
-#define __NR_semtimedop 369
-#define __NR_msgsnd 370
-#define __NR_msgrcv 371
-#define __NR_msgget 372
-#define __NR_msgctl 373
-#define __NR_shmat 374
-#define __NR_shmdt 375
-#define __NR_shmget 376
-#define __NR_shmctl 377
#define __NR_mlock2 378
#endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
{
+ /*
+ * Check for illegal transactional state bit combination
+ * and if we find it, force the TS field to a safe state.
+ */
+ if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
+ msr &= ~MSR_TS_MASK;
vcpu->arch.shregs.msr = msr;
kvmppc_end_cede(vcpu);
}
PPC_LI(r_X, 0);
}
- switch (filter[0].code) {
- case BPF_RET | BPF_K:
- case BPF_LD | BPF_W | BPF_LEN:
- case BPF_LD | BPF_W | BPF_ABS:
- case BPF_LD | BPF_H | BPF_ABS:
- case BPF_LD | BPF_B | BPF_ABS:
- /* first instruction sets A register (or is RET 'constant') */
- break;
- default:
- /* make sure we dont leak kernel information to user */
+ /* make sure we don't leak kernel information to user */
+ if (bpf_needs_clear_a(&filter[0]))
PPC_LI(r_A, 0);
- }
}
static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)
set_bit(d->hwirq, &opal_event_irqchip.mask);
opal_poll_events(&events);
- opal_handle_events(be64_to_cpu(events));
+ last_outstanding_events = be64_to_cpu(events);
+
+ /*
+ * We can't just handle the events now with opal_handle_events().
+ * If we did we would deadlock when opal_event_unmask() is called from
+ * handle_level_irq() with the irq descriptor lock held, because
+ * calling opal_handle_events() would call generic_handle_irq() and
+ * then handle_level_irq() which would try to take the descriptor lock
+ * again. Instead queue the events for later.
+ */
+ if (last_outstanding_events & opal_event_irqchip.mask)
+ /* Need to retrigger the interrupt */
+ irq_work_queue(&opal_event_irq_work);
}
static int opal_event_set_type(struct irq_data *d, unsigned int flow_type)
/* Sanity check */
if (type >= OPAL_MSG_TYPE_MAX) {
- pr_warning("%s: Unknown message type: %u\n", __func__, type);
+ pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
return;
}
opal_message_do_notify(type, (void *)&msg);
#define smp_mb__before_atomic() smp_mb()
#define smp_mb__after_atomic() smp_mb()
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
#define smp_store_release(p, v) \
do { \
}
if (separator)
ptr += sprintf(ptr, "%c", separator);
+ /*
+ * Use four '%' characters below because of the
+ * following two conversions:
+ *
+ * 1) sprintf: %%%%r -> %%r
+ * 2) printk : %%r -> %r
+ */
if (operand->flags & OPERAND_GPR)
- ptr += sprintf(ptr, "%%r%i", value);
+ ptr += sprintf(ptr, "%%%%r%i", value);
else if (operand->flags & OPERAND_FPR)
- ptr += sprintf(ptr, "%%f%i", value);
+ ptr += sprintf(ptr, "%%%%f%i", value);
else if (operand->flags & OPERAND_AR)
- ptr += sprintf(ptr, "%%a%i", value);
+ ptr += sprintf(ptr, "%%%%a%i", value);
else if (operand->flags & OPERAND_CR)
- ptr += sprintf(ptr, "%%c%i", value);
+ ptr += sprintf(ptr, "%%%%c%i", value);
else if (operand->flags & OPERAND_VR)
- ptr += sprintf(ptr, "%%v%i", value);
+ ptr += sprintf(ptr, "%%%%v%i", value);
else if (operand->flags & OPERAND_PCREL)
ptr += sprintf(ptr, "%lx", (signed int) value
+ addr);
* really available. So we simply advertise only "crypto" support.
*/
#define HWCAP_SPARC_CRYPTO 0x04000000 /* CRYPTO insns available */
+#define HWCAP_SPARC_ADI 0x08000000 /* ADI available */
#define CORE_DUMP_USE_REGSET
#define __NR_bpf 349
#define __NR_execveat 350
#define __NR_membarrier 351
+#define __NR_userfaultfd 352
+#define __NR_bind 353
+#define __NR_listen 354
+#define __NR_setsockopt 355
+#define __NR_mlock2 356
-#define NR_syscalls 352
+#define NR_syscalls 357
/* Bitmask values returned from kern_features system call. */
#define KERN_FEATURE_MIXED_MODE_STACK 0x00000001
mov 1, %o0
ENDPROC(__retl_one)
+ENTRY(__retl_one_fp)
+ VISExitHalf
+ retl
+ mov 1, %o0
+ENDPROC(__retl_one_fp)
+
ENTRY(__ret_one_asi)
wr %g0, ASI_AIUS, %asi
ret
mov 1, %o0
ENDPROC(__retl_one_asi)
+ENTRY(__retl_one_asi_fp)
+ wr %g0, ASI_AIUS, %asi
+ VISExitHalf
+ retl
+ mov 1, %o0
+ENDPROC(__retl_one_asi_fp)
+
ENTRY(__retl_o1)
retl
mov %o1, %o0
void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
+ u64 saved_fault_address = current_thread_info()->fault_address;
+ u8 saved_fault_code = get_thread_fault_code();
+ mm_segment_t old_fs;
+
perf_callchain_store(entry, regs->tpc);
if (!current->mm)
return;
+ old_fs = get_fs();
+ set_fs(USER_DS);
+
flushw_user();
pagefault_disable();
perf_callchain_user_64(entry, regs);
pagefault_enable();
+
+ set_fs(old_fs);
+ set_thread_fault_code(saved_fault_code);
+ current_thread_info()->fault_address = saved_fault_address;
}
andn %l1, %l4, %l1
srl %l4, 20, %l4
ba,pt %xcc, rtrap_no_irq_enable
- wrpr %l4, %pil
+ nop
+ /* Do not actually set the %pil here. We will do that
+ * below after we clear PSTATE_IE in the %pstate register.
+ * If we re-enable interrupts here, we can recurse down
+ * the hardirq stack potentially endlessly, causing a
+ * stack overflow.
+ */
.align 64
.globl rtrap_irq, rtrap, irqsz_patchme, rtrap_xcall
*/
"mul32", "div32", "fsmuld", "v8plus", "popc", "vis", "vis2",
"ASIBlkInit", "fmaf", "vis3", "hpc", "random", "trans", "fjfmau",
- "ima", "cspare", "pause", "cbcond",
+ "ima", "cspare", "pause", "cbcond", NULL /*reserved for crypto */,
+ "adp",
};
static const char *crypto_hwcaps[] = {
seq_puts(m, "cpucaps\t\t: ");
for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
unsigned long bit = 1UL << i;
- if (caps & bit) {
+ if (hwcaps[i] && (caps & bit)) {
seq_printf(m, "%s%s",
printed ? "," : "", hwcaps[i]);
printed++;
for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
unsigned long bit = 1UL << i;
- if (caps & bit)
+ if (hwcaps[i] && (caps & bit))
report_one_hwcap(&printed, hwcaps[i]);
}
if (caps & HWCAP_SPARC_CRYPTO)
for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
unsigned long bit = 1UL << i;
- if (!strcmp(prop, hwcaps[i])) {
+ if (hwcaps[i] && !strcmp(prop, hwcaps[i])) {
caps |= bit;
break;
}
/*80*/ .long sys_setgroups16, sys_getpgrp, sys_setgroups, sys_setitimer, sys_ftruncate64
/*85*/ .long sys_swapon, sys_getitimer, sys_setuid, sys_sethostname, sys_setgid
/*90*/ .long sys_dup2, sys_setfsuid, sys_fcntl, sys_select, sys_setfsgid
-/*95*/ .long sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+/*95*/ .long sys_fsync, sys_setpriority, sys_socket, sys_connect, sys_accept
/*100*/ .long sys_getpriority, sys_rt_sigreturn, sys_rt_sigaction, sys_rt_sigprocmask, sys_rt_sigpending
/*105*/ .long sys_rt_sigtimedwait, sys_rt_sigqueueinfo, sys_rt_sigsuspend, sys_setresuid, sys_getresuid
-/*110*/ .long sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall
-/*115*/ .long sys_getgroups, sys_gettimeofday, sys_getrusage, sys_nis_syscall, sys_getcwd
+/*110*/ .long sys_setresgid, sys_getresgid, sys_setregid, sys_recvmsg, sys_sendmsg
+/*115*/ .long sys_getgroups, sys_gettimeofday, sys_getrusage, sys_getsockopt, sys_getcwd
/*120*/ .long sys_readv, sys_writev, sys_settimeofday, sys_fchown16, sys_fchmod
-/*125*/ .long sys_nis_syscall, sys_setreuid16, sys_setregid16, sys_rename, sys_truncate
-/*130*/ .long sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall
-/*135*/ .long sys_nis_syscall, sys_mkdir, sys_rmdir, sys_utimes, sys_stat64
-/*140*/ .long sys_sendfile64, sys_nis_syscall, sys_futex, sys_gettid, sys_getrlimit
+/*125*/ .long sys_recvfrom, sys_setreuid16, sys_setregid16, sys_rename, sys_truncate
+/*130*/ .long sys_ftruncate, sys_flock, sys_lstat64, sys_sendto, sys_shutdown
+/*135*/ .long sys_socketpair, sys_mkdir, sys_rmdir, sys_utimes, sys_stat64
+/*140*/ .long sys_sendfile64, sys_getpeername, sys_futex, sys_gettid, sys_getrlimit
/*145*/ .long sys_setrlimit, sys_pivot_root, sys_prctl, sys_pciconfig_read, sys_pciconfig_write
-/*150*/ .long sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64
+/*150*/ .long sys_getsockname, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64
/*155*/ .long sys_fcntl64, sys_inotify_rm_watch, sys_statfs, sys_fstatfs, sys_oldumount
/*160*/ .long sys_sched_setaffinity, sys_sched_getaffinity, sys_getdomainname, sys_setdomainname, sys_nis_syscall
/*165*/ .long sys_quotactl, sys_set_tid_address, sys_mount, sys_ustat, sys_setxattr
/*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
/*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
/*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
-/*350*/ .long sys_execveat, sys_membarrier
+/*350*/ .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
+/*355*/ .long sys_setsockopt, sys_mlock2
/*80*/ .word sys_setgroups16, sys_getpgrp, sys_setgroups, compat_sys_setitimer, sys32_ftruncate64
.word sys_swapon, compat_sys_getitimer, sys_setuid, sys_sethostname, sys_setgid
/*90*/ .word sys_dup2, sys_setfsuid, compat_sys_fcntl, sys32_select, sys_setfsgid
- .word sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+ .word sys_fsync, sys_setpriority, sys_socket, sys_connect, sys_accept
/*100*/ .word sys_getpriority, sys32_rt_sigreturn, compat_sys_rt_sigaction, compat_sys_rt_sigprocmask, compat_sys_rt_sigpending
.word compat_sys_rt_sigtimedwait, compat_sys_rt_sigqueueinfo, compat_sys_rt_sigsuspend, sys_setresuid, sys_getresuid
-/*110*/ .word sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall
- .word sys_getgroups, compat_sys_gettimeofday, compat_sys_getrusage, sys_nis_syscall, sys_getcwd
+/*110*/ .word sys_setresgid, sys_getresgid, sys_setregid, compat_sys_recvmsg, compat_sys_sendmsg
+ .word sys_getgroups, compat_sys_gettimeofday, compat_sys_getrusage, compat_sys_getsockopt, sys_getcwd
/*120*/ .word compat_sys_readv, compat_sys_writev, compat_sys_settimeofday, sys_fchown16, sys_fchmod
- .word sys_nis_syscall, sys_setreuid16, sys_setregid16, sys_rename, compat_sys_truncate
-/*130*/ .word compat_sys_ftruncate, sys_flock, compat_sys_lstat64, sys_nis_syscall, sys_nis_syscall
- .word sys_nis_syscall, sys_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64
+ .word sys_recvfrom, sys_setreuid16, sys_setregid16, sys_rename, compat_sys_truncate
+/*130*/ .word compat_sys_ftruncate, sys_flock, compat_sys_lstat64, sys_sendto, sys_shutdown
+ .word sys_socketpair, sys_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64
/*140*/ .word sys_sendfile64, sys_nis_syscall, sys32_futex, sys_gettid, compat_sys_getrlimit
.word compat_sys_setrlimit, sys_pivot_root, sys_prctl, sys_pciconfig_read, sys_pciconfig_write
/*150*/ .word sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64
.word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev
/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
.word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
-/*350*/ .word sys32_execveat, sys_membarrier
+/*350*/ .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
+ .word compat_sys_setsockopt, sys_mlock2
#endif /* CONFIG_COMPAT */
.word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev
/*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr
.word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf
-/*350*/ .word sys64_execveat, sys_membarrier
+/*350*/ .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen
+ .word sys_setsockopt, sys_mlock2
.text; \
.align 4;
+#define EX_LD_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_asi_fp;\
+ .text; \
+ .align 4;
+
#ifndef ASI_AIUS
#define ASI_AIUS 0x11
#endif
.text; \
.align 4;
+#define EX_ST_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_asi_fp;\
+ .text; \
+ .align 4;
+
#ifndef ASI_AIUS
#define ASI_AIUS 0x11
#endif
#ifndef EX_LD
#define EX_LD(x) x
#endif
+#ifndef EX_LD_FP
+#define EX_LD_FP(x) x
+#endif
#ifndef EX_ST
#define EX_ST(x) x
#endif
+#ifndef EX_ST_FP
+#define EX_ST_FP(x) x
+#endif
#ifndef EX_RETVAL
#define EX_RETVAL(x) x
fsrc2 %x6, %f12; \
fsrc2 %x7, %f14;
#define FREG_LOAD_1(base, x0) \
- EX_LD(LOAD(ldd, base + 0x00, %x0))
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0))
#define FREG_LOAD_2(base, x0, x1) \
- EX_LD(LOAD(ldd, base + 0x00, %x0)); \
- EX_LD(LOAD(ldd, base + 0x08, %x1));
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
+ EX_LD_FP(LOAD(ldd, base + 0x08, %x1));
#define FREG_LOAD_3(base, x0, x1, x2) \
- EX_LD(LOAD(ldd, base + 0x00, %x0)); \
- EX_LD(LOAD(ldd, base + 0x08, %x1)); \
- EX_LD(LOAD(ldd, base + 0x10, %x2));
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
+ EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
+ EX_LD_FP(LOAD(ldd, base + 0x10, %x2));
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
- EX_LD(LOAD(ldd, base + 0x00, %x0)); \
- EX_LD(LOAD(ldd, base + 0x08, %x1)); \
- EX_LD(LOAD(ldd, base + 0x10, %x2)); \
- EX_LD(LOAD(ldd, base + 0x18, %x3));
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
+ EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
+ EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
+ EX_LD_FP(LOAD(ldd, base + 0x18, %x3));
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
- EX_LD(LOAD(ldd, base + 0x00, %x0)); \
- EX_LD(LOAD(ldd, base + 0x08, %x1)); \
- EX_LD(LOAD(ldd, base + 0x10, %x2)); \
- EX_LD(LOAD(ldd, base + 0x18, %x3)); \
- EX_LD(LOAD(ldd, base + 0x20, %x4));
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
+ EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
+ EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
+ EX_LD_FP(LOAD(ldd, base + 0x18, %x3)); \
+ EX_LD_FP(LOAD(ldd, base + 0x20, %x4));
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
- EX_LD(LOAD(ldd, base + 0x00, %x0)); \
- EX_LD(LOAD(ldd, base + 0x08, %x1)); \
- EX_LD(LOAD(ldd, base + 0x10, %x2)); \
- EX_LD(LOAD(ldd, base + 0x18, %x3)); \
- EX_LD(LOAD(ldd, base + 0x20, %x4)); \
- EX_LD(LOAD(ldd, base + 0x28, %x5));
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
+ EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
+ EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
+ EX_LD_FP(LOAD(ldd, base + 0x18, %x3)); \
+ EX_LD_FP(LOAD(ldd, base + 0x20, %x4)); \
+ EX_LD_FP(LOAD(ldd, base + 0x28, %x5));
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
- EX_LD(LOAD(ldd, base + 0x00, %x0)); \
- EX_LD(LOAD(ldd, base + 0x08, %x1)); \
- EX_LD(LOAD(ldd, base + 0x10, %x2)); \
- EX_LD(LOAD(ldd, base + 0x18, %x3)); \
- EX_LD(LOAD(ldd, base + 0x20, %x4)); \
- EX_LD(LOAD(ldd, base + 0x28, %x5)); \
- EX_LD(LOAD(ldd, base + 0x30, %x6));
+ EX_LD_FP(LOAD(ldd, base + 0x00, %x0)); \
+ EX_LD_FP(LOAD(ldd, base + 0x08, %x1)); \
+ EX_LD_FP(LOAD(ldd, base + 0x10, %x2)); \
+ EX_LD_FP(LOAD(ldd, base + 0x18, %x3)); \
+ EX_LD_FP(LOAD(ldd, base + 0x20, %x4)); \
+ EX_LD_FP(LOAD(ldd, base + 0x28, %x5)); \
+ EX_LD_FP(LOAD(ldd, base + 0x30, %x6));
.register %g2,#scratch
.register %g3,#scratch
nop
/* fall through for 0 < low bits < 8 */
110: sub %o4, 64, %g2
- EX_LD(LOAD_BLK(%g2, %f0))
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+ EX_LD_FP(LOAD_BLK(%g2, %f0))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
120: sub %o4, 56, %g2
FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
130: sub %o4, 48, %g2
FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
140: sub %o4, 40, %g2
FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_5(f22, f24, f26, f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
150: sub %o4, 32, %g2
FREG_LOAD_4(%g2, f0, f2, f4, f6)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_4(f24, f26, f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
160: sub %o4, 24, %g2
FREG_LOAD_3(%g2, f0, f2, f4)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_3(f26, f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
170: sub %o4, 16, %g2
FREG_LOAD_2(%g2, f0, f2)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_2(f28, f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
180: sub %o4, 8, %g2
FREG_LOAD_1(%g2, f0)
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
- EX_LD(LOAD_BLK(%o4, %f16))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f16))
FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
FREG_MOVE_1(f30)
subcc %g1, 64, %g1
add %o4, 64, %o4
nop
190:
-1: EX_ST(STORE_INIT(%g0, %o4 + %g3))
+1: EX_ST_FP(STORE_INIT(%g0, %o4 + %g3))
subcc %g1, 64, %g1
- EX_LD(LOAD_BLK(%o4, %f0))
- EX_ST(STORE_BLK(%f0, %o4 + %g3))
+ EX_LD_FP(LOAD_BLK(%o4, %f0))
+ EX_ST_FP(STORE_BLK(%f0, %o4 + %g3))
add %o4, 64, %o4
bne,pt %xcc, 1b
LOAD(prefetch, %o4 + 64, #one_read)
.text; \
.align 4;
+#define EX_LD_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_asi_fp;\
+ .text; \
+ .align 4;
+
#ifndef ASI_AIUS
#define ASI_AIUS 0x11
#endif
.text; \
.align 4;
+#define EX_ST_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_asi_fp;\
+ .text; \
+ .align 4;
+
#ifndef ASI_AIUS
#define ASI_AIUS 0x11
#endif
#ifndef EX_LD
#define EX_LD(x) x
#endif
+#ifndef EX_LD_FP
+#define EX_LD_FP(x) x
+#endif
#ifndef EX_ST
#define EX_ST(x) x
#endif
+#ifndef EX_ST_FP
+#define EX_ST_FP(x) x
+#endif
#ifndef EX_RETVAL
#define EX_RETVAL(x) x
sub %o2, %o4, %o2
alignaddr %o1, %g0, %g1
add %o1, %o4, %o1
- EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
-1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0))
+1: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2))
subcc %o4, 0x40, %o4
- EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
- EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
- EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
- EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
- EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
- EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14))
faligndata %f0, %f2, %f16
- EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
+ EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0))
faligndata %f2, %f4, %f18
add %g1, 0x40, %g1
faligndata %f4, %f6, %f20
faligndata %f10, %f12, %f26
faligndata %f12, %f14, %f28
faligndata %f14, %f0, %f30
- EX_ST(STORE(std, %f16, %o0 + 0x00))
- EX_ST(STORE(std, %f18, %o0 + 0x08))
- EX_ST(STORE(std, %f20, %o0 + 0x10))
- EX_ST(STORE(std, %f22, %o0 + 0x18))
- EX_ST(STORE(std, %f24, %o0 + 0x20))
- EX_ST(STORE(std, %f26, %o0 + 0x28))
- EX_ST(STORE(std, %f28, %o0 + 0x30))
- EX_ST(STORE(std, %f30, %o0 + 0x38))
+ EX_ST_FP(STORE(std, %f16, %o0 + 0x00))
+ EX_ST_FP(STORE(std, %f18, %o0 + 0x08))
+ EX_ST_FP(STORE(std, %f20, %o0 + 0x10))
+ EX_ST_FP(STORE(std, %f22, %o0 + 0x18))
+ EX_ST_FP(STORE(std, %f24, %o0 + 0x20))
+ EX_ST_FP(STORE(std, %f26, %o0 + 0x28))
+ EX_ST_FP(STORE(std, %f28, %o0 + 0x30))
+ EX_ST_FP(STORE(std, %f30, %o0 + 0x38))
add %o0, 0x40, %o0
bne,pt %icc, 1b
LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
.text; \
.align 4;
+#define EX_LD_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_fp;\
+ .text; \
+ .align 4;
+
#define FUNC_NAME ___copy_from_user
#define LOAD(type,addr,dest) type##a [addr] %asi, dest
#define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_AIUS, dest
.text; \
.align 4;
+#define EX_ST_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_fp;\
+ .text; \
+ .align 4;
+
#define FUNC_NAME ___copy_to_user
#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS
#ifndef EX_LD
#define EX_LD(x) x
#endif
+#ifndef EX_LD_FP
+#define EX_LD_FP(x) x
+#endif
#ifndef EX_ST
#define EX_ST(x) x
#endif
+#ifndef EX_ST_FP
+#define EX_ST_FP(x) x
+#endif
#ifndef EX_RETVAL
#define EX_RETVAL(x) x
faligndata %f8, %f9, %f62;
#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) \
- EX_LD(LOAD_BLK(%src, %fdest)); \
- EX_ST(STORE_BLK(%fsrc, %dest)); \
+ EX_LD_FP(LOAD_BLK(%src, %fdest)); \
+ EX_ST_FP(STORE_BLK(%fsrc, %dest)); \
add %src, 0x40, %src; \
subcc %len, 0x40, %len; \
be,pn %xcc, jmptgt; \
#define DO_SYNC membar #Sync;
#define STORE_SYNC(dest, fsrc) \
- EX_ST(STORE_BLK(%fsrc, %dest)); \
+ EX_ST_FP(STORE_BLK(%fsrc, %dest)); \
add %dest, 0x40, %dest; \
DO_SYNC
#define STORE_JUMP(dest, fsrc, target) \
- EX_ST(STORE_BLK(%fsrc, %dest)); \
+ EX_ST_FP(STORE_BLK(%fsrc, %dest)); \
add %dest, 0x40, %dest; \
ba,pt %xcc, target; \
nop;
subcc %left, 8, %left;\
bl,pn %xcc, 95f; \
faligndata %f0, %f1, %f48; \
- EX_ST(STORE(std, %f48, %dest)); \
+ EX_ST_FP(STORE(std, %f48, %dest)); \
add %dest, 8, %dest;
#define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \
and %g2, 0x38, %g2
1: subcc %g1, 0x1, %g1
- EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
- EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
+ EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
+ EX_ST_FP(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
bgu,pt %XCC, 1b
add %o1, 0x1, %o1
be,pt %icc, 3f
alignaddr %o1, %g0, %o1
- EX_LD(LOAD(ldd, %o1, %f4))
-1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
+ EX_LD_FP(LOAD(ldd, %o1, %f4))
+1: EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
add %o1, 0x8, %o1
subcc %g2, 0x8, %g2
faligndata %f4, %f6, %f0
- EX_ST(STORE(std, %f0, %o0))
+ EX_ST_FP(STORE(std, %f0, %o0))
be,pn %icc, 3f
add %o0, 0x8, %o0
- EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
add %o1, 0x8, %o1
subcc %g2, 0x8, %g2
faligndata %f6, %f4, %f0
- EX_ST(STORE(std, %f0, %o0))
+ EX_ST_FP(STORE(std, %f0, %o0))
bne,pt %icc, 1b
add %o0, 0x8, %o0
add %g1, %GLOBAL_SPARE, %g1
subcc %o2, %g3, %o2
- EX_LD(LOAD_BLK(%o1, %f0))
+ EX_LD_FP(LOAD_BLK(%o1, %f0))
add %o1, 0x40, %o1
add %g1, %g3, %g1
- EX_LD(LOAD_BLK(%o1, %f16))
+ EX_LD_FP(LOAD_BLK(%o1, %f16))
add %o1, 0x40, %o1
sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
- EX_LD(LOAD_BLK(%o1, %f32))
+ EX_LD_FP(LOAD_BLK(%o1, %f32))
add %o1, 0x40, %o1
/* There are 8 instances of the unrolled loop,
62: FINISH_VISCHUNK(o0, f44, f46, g3)
63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3)
-93: EX_LD(LOAD(ldd, %o1, %f2))
+93: EX_LD_FP(LOAD(ldd, %o1, %f2))
add %o1, 8, %o1
subcc %g3, 8, %g3
faligndata %f0, %f2, %f8
- EX_ST(STORE(std, %f8, %o0))
+ EX_ST_FP(STORE(std, %f8, %o0))
bl,pn %xcc, 95f
add %o0, 8, %o0
- EX_LD(LOAD(ldd, %o1, %f0))
+ EX_LD_FP(LOAD(ldd, %o1, %f0))
add %o1, 8, %o1
subcc %g3, 8, %g3
faligndata %f2, %f0, %f8
- EX_ST(STORE(std, %f8, %o0))
+ EX_ST_FP(STORE(std, %f8, %o0))
bge,pt %xcc, 93b
add %o0, 8, %o0
95: brz,pt %o2, 2f
mov %g1, %o1
-1: EX_LD(LOAD(ldub, %o1, %o3))
+1: EX_LD_FP(LOAD(ldub, %o1, %o3))
add %o1, 1, %o1
subcc %o2, 1, %o2
- EX_ST(STORE(stb, %o3, %o0))
+ EX_ST_FP(STORE(stb, %o3, %o0))
bne,pt %xcc, 1b
add %o0, 1, %o0
.text; \
.align 4;
+#define EX_LD_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_fp;\
+ .text; \
+ .align 4;
+
#define FUNC_NAME U3copy_from_user
#define LOAD(type,addr,dest) type##a [addr] %asi, dest
#define EX_RETVAL(x) 0
.text; \
.align 4;
+#define EX_ST_FP(x) \
+98: x; \
+ .section __ex_table,"a";\
+ .align 4; \
+ .word 98b, __retl_one_fp;\
+ .text; \
+ .align 4;
+
#define FUNC_NAME U3copy_to_user
#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
#define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_AIUS
#ifndef EX_LD
#define EX_LD(x) x
#endif
+#ifndef EX_LD_FP
+#define EX_LD_FP(x) x
+#endif
#ifndef EX_ST
#define EX_ST(x) x
#endif
+#ifndef EX_ST_FP
+#define EX_ST_FP(x) x
+#endif
#ifndef EX_RETVAL
#define EX_RETVAL(x) x
and %g2, 0x38, %g2
1: subcc %g1, 0x1, %g1
- EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
- EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
+ EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
+ EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
bgu,pt %XCC, 1b
add %o1, 0x1, %o1
be,pt %icc, 3f
alignaddr %o1, %g0, %o1
- EX_LD(LOAD(ldd, %o1, %f4))
-1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
+ EX_LD_FP(LOAD(ldd, %o1, %f4))
+1: EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
add %o1, 0x8, %o1
subcc %g2, 0x8, %g2
faligndata %f4, %f6, %f0
- EX_ST(STORE(std, %f0, %o0))
+ EX_ST_FP(STORE(std, %f0, %o0))
be,pn %icc, 3f
add %o0, 0x8, %o0
- EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
add %o1, 0x8, %o1
subcc %g2, 0x8, %g2
faligndata %f6, %f4, %f2
- EX_ST(STORE(std, %f2, %o0))
+ EX_ST_FP(STORE(std, %f2, %o0))
bne,pt %icc, 1b
add %o0, 0x8, %o0
LOAD(prefetch, %o1 + 0x080, #one_read)
LOAD(prefetch, %o1 + 0x0c0, #one_read)
LOAD(prefetch, %o1 + 0x100, #one_read)
- EX_LD(LOAD(ldd, %o1 + 0x000, %f0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0))
LOAD(prefetch, %o1 + 0x140, #one_read)
- EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
LOAD(prefetch, %o1 + 0x180, #one_read)
- EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
LOAD(prefetch, %o1 + 0x1c0, #one_read)
faligndata %f0, %f2, %f16
- EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
faligndata %f2, %f4, %f18
- EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
faligndata %f4, %f6, %f20
- EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
faligndata %f6, %f8, %f22
- EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
faligndata %f8, %f10, %f24
- EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
faligndata %f10, %f12, %f26
- EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE
add %o1, 0x40, %o1
.align 64
1:
- EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
faligndata %f12, %f14, %f28
- EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
faligndata %f14, %f0, %f30
- EX_ST(STORE_BLK(%f16, %o0))
- EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
+ EX_ST_FP(STORE_BLK(%f16, %o0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
faligndata %f0, %f2, %f16
add %o0, 0x40, %o0
- EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
faligndata %f2, %f4, %f18
- EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
faligndata %f4, %f6, %f20
- EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
subcc %o3, 0x01, %o3
faligndata %f6, %f8, %f22
- EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
faligndata %f8, %f10, %f24
- EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
LOAD(prefetch, %o1 + 0x1c0, #one_read)
faligndata %f10, %f12, %f26
bg,pt %XCC, 1b
/* Finally we copy the last full 64-byte block. */
2:
- EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
faligndata %f12, %f14, %f28
- EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
faligndata %f14, %f0, %f30
- EX_ST(STORE_BLK(%f16, %o0))
- EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
+ EX_ST_FP(STORE_BLK(%f16, %o0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
faligndata %f0, %f2, %f16
- EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
faligndata %f2, %f4, %f18
- EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
faligndata %f4, %f6, %f20
- EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
faligndata %f6, %f8, %f22
- EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
faligndata %f8, %f10, %f24
cmp %g1, 0
be,pt %XCC, 1f
add %o0, 0x40, %o0
- EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
1: faligndata %f10, %f12, %f26
faligndata %f12, %f14, %f28
faligndata %f14, %f0, %f30
- EX_ST(STORE_BLK(%f16, %o0))
+ EX_ST_FP(STORE_BLK(%f16, %o0))
add %o0, 0x40, %o0
add %o1, 0x40, %o1
membar #Sync
sub %o2, %g2, %o2
be,a,pt %XCC, 1f
- EX_LD(LOAD(ldd, %o1 + 0x00, %f0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0))
-1: EX_LD(LOAD(ldd, %o1 + 0x08, %f2))
+1: EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2))
add %o1, 0x8, %o1
subcc %g2, 0x8, %g2
faligndata %f0, %f2, %f8
- EX_ST(STORE(std, %f8, %o0))
+ EX_ST_FP(STORE(std, %f8, %o0))
be,pn %XCC, 2f
add %o0, 0x8, %o0
- EX_LD(LOAD(ldd, %o1 + 0x08, %f0))
+ EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0))
add %o1, 0x8, %o1
subcc %g2, 0x8, %g2
faligndata %f2, %f0, %f8
- EX_ST(STORE(std, %f8, %o0))
+ EX_ST_FP(STORE(std, %f8, %o0))
bne,pn %XCC, 1b
add %o0, 0x8, %o0
}
emit_reg_move(O7, r_saved_O7);
- switch (filter[0].code) {
- case BPF_RET | BPF_K:
- case BPF_LD | BPF_W | BPF_LEN:
- case BPF_LD | BPF_W | BPF_ABS:
- case BPF_LD | BPF_H | BPF_ABS:
- case BPF_LD | BPF_B | BPF_ABS:
- /* The first instruction sets the A register (or is
- * a "RET 'constant'")
- */
- break;
- default:
- /* Make sure we dont leak kernel information to the
- * user.
- */
+ /* Make sure we don't leak kernel information to the user. */
+ if (bpf_needs_clear_a(&filter[0]))
emit_clear(r_A); /* A = 0 */
- }
for (i = 0; i < flen; i++) {
unsigned int K = filter[i].k;
smaller kernel memory footprint results from using a smaller
value on chips with fewer tiles.
-if TILEGX
-
choice
prompt "Kernel page size"
default PAGE_SIZE_64KB
connections, etc., it may be better to select 16KB, which uses
memory more efficiently at some cost in TLB performance.
- Note that this option is TILE-Gx specific; currently
- TILEPro page size is set by rebuilding the hypervisor.
+ Note that for TILEPro, you must also rebuild the hypervisor
+ with a matching page size.
+
+config PAGE_SIZE_4KB
+ bool "4KB" if TILEPRO
config PAGE_SIZE_16KB
bool "16KB"
endchoice
-endif
-
source "kernel/Kconfig.hz"
config KEXEC
#endif
-#define tas(ptr) xchg((ptr), 1)
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_TILE_CMPXCHG_H */
#include <arch/chip.h>
/* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
-#if defined(CONFIG_PAGE_SIZE_16KB)
+#if defined(CONFIG_PAGE_SIZE_4KB) /* tilepro only */
+#define PAGE_SHIFT 12
+#define CTX_PAGE_FLAG HV_CTX_PG_SM_4K
+#elif defined(CONFIG_PAGE_SIZE_16KB)
#define PAGE_SHIFT 14
#define CTX_PAGE_FLAG HV_CTX_PG_SM_16K
#elif defined(CONFIG_PAGE_SIZE_64KB)
#define PAGE_SHIFT 16
#define CTX_PAGE_FLAG HV_CTX_PG_SM_64K
#else
-#define PAGE_SHIFT HV_LOG2_DEFAULT_PAGE_SIZE_SMALL
-#define CTX_PAGE_FLAG 0
+#error Page size not specified in Kconfig
#endif
#define HPAGE_SHIFT HV_LOG2_DEFAULT_PAGE_SIZE_LARGE
If you are unsure how to answer this question, answer Y.
+config QUEUED_LOCK_STAT
+ bool "Paravirt queued spinlock statistics"
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ ---help---
+ Enable the collection of statistical data on the slowpath
+ behavior of paravirtualized queued spinlocks and report
+ them on debugfs.
+
source "arch/x86/xen/Kconfig"
config KVM_GUEST
regs->ip = landing_pad;
/*
- * Fetch ECX from where the vDSO stashed it.
+ * Fetch EBP from where the vDSO stashed it.
*
* WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
*/
* Micro-optimization: the pointer we're following is explicitly
* 32 bits, so it can't be out of range.
*/
- __get_user(*(u32 *)&regs->cx,
+ __get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#else
- get_user(*(u32 *)&regs->cx,
+ get_user(*(u32 *)&regs->bp,
(u32 __user __force *)(unsigned long)(u32)regs->sp)
#endif
) {
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
- pushl %ecx /* pt_regs->cx */
+ pushl %ebp /* pt_regs->sp (stashed in bp) */
pushfl /* pt_regs->flags (except IF = 0) */
orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
pushl $__USER_CS /* pt_regs->cs */
movl %esp, %eax
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
/* Opportunistic SYSEXIT */
TRACE_IRQS_ON /* User mode traces as IRQs on. */
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
- pushq %rcx /* pt_regs->sp */
+ pushq %rbp /* pt_regs->sp (stashed in bp) */
/*
* Push flags. This is nasty. First, interrupts are currently
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx (will be overwritten) */
+ pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 = 0 */
pushq %r8 /* pt_regs->r9 = 0 */
pushq %r8 /* pt_regs->r10 = 0 */
pushq %r8 /* pt_regs->r11 = 0 */
pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
pushq %r8 /* pt_regs->r12 = 0 */
pushq %r8 /* pt_regs->r13 = 0 */
pushq %r8 /* pt_regs->r14 = 0 */
movq %rsp, %rdi
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
jmp sysret32_from_system_call
sysenter_fix_flags:
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx (will be overwritten) */
+ pushq %rbp /* pt_regs->cx (stashed in bp) */
pushq $-ENOSYS /* pt_regs->ax */
xorq %r8,%r8
pushq %r8 /* pt_regs->r8 = 0 */
pushq %r8 /* pt_regs->r10 = 0 */
pushq %r8 /* pt_regs->r11 = 0 */
pushq %rbx /* pt_regs->rbx */
- pushq %rbp /* pt_regs->rbp */
+ pushq %rbp /* pt_regs->rbp (will be overwritten) */
pushq %r8 /* pt_regs->r12 = 0 */
pushq %r8 /* pt_regs->r13 = 0 */
pushq %r8 /* pt_regs->r14 = 0 */
movq %rsp, %rdi
call do_fast_syscall_32
- testl %eax, %eax
- jz .Lsyscall_32_done
+ /* XEN PV guests always use IRET path */
+ ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
+ "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
/*
- * Code for the vDSO. This version uses the old int $0x80 method.
+ * AT_SYSINFO entry point
*/
#include <asm/dwarf2.h>
/*
* Reshuffle regs so that all of any of the entry instructions
* will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may elucidate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * Personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
*/
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
pushl %edx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edx, 0
- pushl %ecx
+ pushl %ebp
CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx, 0
- movl %esp, %ecx
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
#ifdef CONFIG_X86_64
/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
- ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
- "syscall", X86_FEATURE_SYSCALL32
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
#else
- ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
#endif
/* Enter using int $0x80 */
- movl (%esp), %ecx
int $0x80
GLOBAL(int80_landing_pad)
- /* Restore ECX and EDX in case they were clobbered. */
- popl %ecx
- CFI_RESTORE ecx
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
CFI_ADJUST_CFA_OFFSET -4
popl %edx
CFI_RESTORE edx
CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
ret
CFI_ENDPROC
#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
return pv_info.paravirt_enabled;
}
+static inline int paravirt_has_feature(unsigned int feature)
+{
+ WARN_ON_ONCE(!pv_info.paravirt_enabled);
+ return (pv_info.features & feature);
+}
+
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
{
#endif
int paravirt_enabled;
+ unsigned int features; /* valid only if paravirt_enabled is set */
const char *name;
};
+#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x)
+/* Supported features */
+#define PV_SUPPORTED_RTC (1<<0)
+
struct pv_init_ops {
/*
* Patch may replace one of the defined code sequences with
#else
#define __cpuid native_cpuid
#define paravirt_enabled() 0
+#define paravirt_has(x) 0
static inline void load_sp0(struct tss_struct *tss,
struct thread_struct *thread)
#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock __pv_queued_spin_unlock
+#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the register-saving thunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ * struct __qspinlock *l = (void *)lock;
+ * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *
+ * if (likely(lockval == _Q_LOCKED_VAL))
+ * return;
+ * pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ * rdi = lock (first argument)
+ * rsi = lockval (second argument)
+ * rdx = internal variable (set to 0)
+ */
+asm (".pushsection .text;"
+ ".globl " PV_UNLOCK ";"
+ ".align 4,0x90;"
+ PV_UNLOCK ": "
+ "push %rdx;"
+ "mov $0x1,%eax;"
+ "xor %edx,%edx;"
+ "lock cmpxchg %dl,(%rdi);"
+ "cmp $0x1,%al;"
+ "jne .slowpath;"
+ "pop %rdx;"
+ "ret;"
+ ".slowpath: "
+ "push %rsi;"
+ "movzbl %al,%esi;"
+ "call " PV_UNLOCK_SLOWPATH ";"
+ "pop %rsi;"
+ "pop %rdx;"
+ "ret;"
+ ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+ ".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+#endif /* CONFIG_64BIT */
#endif
case 1:
init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
numachip_apic_icr_write = numachip1_apic_icr_write;
- x86_init.pci.arch_init = pci_numachip_init;
break;
case 2:
init_extra_mapping_uc(NUMACHIP2_LCSR_BASE, NUMACHIP2_LCSR_SIZE);
numachip_apic_icr_write = numachip2_apic_icr_write;
-
- /* Use MCFG config cycles rather than locked CF8 cycles */
- raw_pci_ops = &pci_mmcfg;
break;
default:
return 0;
}
x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+ x86_init.pci.arch_init = pci_numachip_init;
return 0;
}
int flags = MF_ACTION_REQUIRED;
int lmce = 0;
+ /* If this CPU is offline, just bail out. */
+ if (cpu_is_offline(smp_processor_id())) {
+ u64 mcgstatus;
+
+ mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ return;
+ }
+ }
+
ist_enter(regs);
this_cpu_inc(mce_exception_count);
}
#endif
+ if (paravirt_enabled() && !paravirt_has(RTC))
+ return -ENODEV;
+
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->edx & bit(X86_FEATURE_MTRR));
+}
+
static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
u8 saved_mode;
if (hpet_legacy_start) {
/* save existing mode for later reenablement */
+ WARN_ON(channel != 0);
saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
pit_load_count(kvm, channel, val);
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
-static u8 mtrr_disabled_type(void)
+static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
{
/*
* Intel SDM 11.11.2.2: all MTRRs are disabled when
* IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
* memory type is applied to all of physical memory.
+ *
+ * However, virtual machines can be run with CPUID such that
+ * there are no MTRRs. In that case, the firmware will never
+ * enable MTRRs, and since it is obviously undesirable to run
+ * the guest entirely with UC memory, we use WB instead.
*/
- return MTRR_TYPE_UNCACHABLE;
+ if (guest_cpuid_has_mtrr(vcpu))
+ return MTRR_TYPE_UNCACHABLE;
+ else
+ return MTRR_TYPE_WRBACK;
}
/*
for (seg = 0; seg < seg_num; seg++) {
mtrr_seg = &fixed_seg_table[seg];
- if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+ if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
return seg;
}
*start = range->base & PAGE_MASK;
mask = range->mask & PAGE_MASK;
- mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
/* This cannot overflow because writing to the reserved bits of
* variable MTRRs causes a #GP.
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
+ /* Extend the mask with all 1 bits to the left, since those
+ * bits must implicitly be 0. The bits are then cleared
+ * when reading them.
+ */
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data;
+ cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
*pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+
+ *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
}
return 0;
}
if (iter.mtrr_disabled)
- return mtrr_disabled_type();
+ return mtrr_disabled_type(vcpu);
/* not contained in any MTRRs. */
if (type == -1)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
- trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
-
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_handle_nmi(&svm->vcpu);
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu))
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Otherwise falls through */
default:
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_TSC_AUX:
- if (!guest_cpuid_has_rdtscp(vcpu))
+ if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated)
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
/*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
+ int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
int start = 0;
+ int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ start && i == 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
+ trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
__kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- trace_kvm_entry(vcpu->vcpu_id);
- wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
pv_info.kernel_rpl = 1;
/* Everyone except Xen runs with this set. */
pv_info.shared_kernel_pmd = 1;
+ pv_info.features = 0;
/*
* We set up all the lguest overrides for sensitive operations. These
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
- { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+ { 0/*PKMAP_BASE*/, "Persistent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
struct sigcontext __user *sc = &frame->sc;
int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
- if (copy_from_user(&set.sig[0], (void *)sc->oldmask, sizeof(set.sig[0])) ||
+ if (copy_from_user(&set.sig[0], &sc->oldmask, sizeof(set.sig[0])) ||
copy_from_user(&set.sig[1], frame->extramask, sig_size))
goto segfault;
#ifdef CONFIG_X86_64
.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
-
+ .features = 0,
.name = "Xen",
};
/* Install Xen paravirt ops */
pv_info = xen_info;
+ if (xen_initial_domain())
+ pv_info.features |= PV_SUPPORTED_RTC;
pv_init_ops = xen_init_ops;
pv_apic_ops = xen_apic_ops;
if (!xen_pvh_domain()) {
static void xen_set_cpu_features(struct cpuinfo_x86 *c)
{
- if (xen_pv_domain())
+ if (xen_pv_domain()) {
clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+ set_cpu_cap(c, X86_FEATURE_XENPV);
+ }
}
const struct hypervisor_x86 x86_hyper_xen = {
{
x86_init.paging.pagetable_init = xen_pagetable_init;
- /* Optimization - we can use the HVM one but it has no idea which
- * VCPUs are descheduled - which means that it will needlessly IPI
- * them. Xen knows so let it do the job.
- */
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
- pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
return;
- }
+
pv_mmu_ops = xen_mmu_ops;
memset(dummy_mapping, 0xff, PAGE_SIZE);
#include <linux/types.h>
#include <linux/tick.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
void xen_arch_pre_suspend(void)
{
- int cpu;
-
- for_each_online_cpu(cpu)
- xen_pmu_finish(cpu);
-
if (xen_pv_domain())
xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
- int cpu;
-
if (xen_pv_domain())
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
-
- for_each_online_cpu(cpu)
- xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)
void xen_arch_resume(void)
{
+ int cpu;
+
on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
}
void xen_arch_suspend(void)
{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
}
EXPORT_SYMBOL(blk_delay_queue);
/**
+ * blk_start_queue_async - asynchronously restart a previously stopped queue
+ * @q: The &struct request_queue in question
+ *
+ * Description:
+ * blk_start_queue_async() will clear the stop flag on the queue, and
+ * ensure that the request_fn for the queue is run from an async
+ * context.
+ **/
+void blk_start_queue_async(struct request_queue *q)
+{
+ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+ blk_run_queue_async(q);
+}
+EXPORT_SYMBOL(blk_start_queue_async);
+
+/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
*
struct request *req;
unsigned int request_count = 0;
- blk_queue_split(q, &bio, q->bio_split);
-
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
*/
blk_queue_bounce(q, &bio);
+ blk_queue_split(q, &bio, q->bio_split);
+
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio->bi_error = -EIO;
bio_endio(bio);
if (WARN_ON_ONCE(in_irq()))
return -EDEADLK;
+ walk->iv = req->info;
walk->nbytes = walk->total;
if (unlikely(!walk->total))
return 0;
walk->iv_buffer = NULL;
- walk->iv = req->info;
if (unlikely(((unsigned long)walk->iv & alignmask))) {
int err = ablkcipher_copy_iv(walk, tfm, alignmask);
bool merge;
bool enc;
- struct ablkcipher_request req;
+ struct skcipher_request req;
};
struct skcipher_async_rsgl {
};
#define GET_SREQ(areq, ctx) (struct skcipher_async_req *)((char *)areq + \
- crypto_ablkcipher_reqsize(crypto_ablkcipher_reqtfm(&ctx->req)))
+ crypto_skcipher_reqsize(crypto_skcipher_reqtfm(&ctx->req)))
#define GET_REQ_SIZE(ctx) \
- crypto_ablkcipher_reqsize(crypto_ablkcipher_reqtfm(&ctx->req))
+ crypto_skcipher_reqsize(crypto_skcipher_reqtfm(&ctx->req))
#define GET_IV_SIZE(ctx) \
- crypto_ablkcipher_ivsize(crypto_ablkcipher_reqtfm(&ctx->req))
+ crypto_skcipher_ivsize(crypto_skcipher_reqtfm(&ctx->req))
#define MAX_SGL_ENTS ((4096 - sizeof(struct skcipher_sg_list)) / \
sizeof(struct scatterlist) - 1)
struct sock *sk = sock->sk;
struct alg_sock *ask = alg_sk(sk);
struct skcipher_ctx *ctx = ask->private;
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req);
- unsigned ivsize = crypto_ablkcipher_ivsize(tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(&ctx->req);
+ unsigned ivsize = crypto_skcipher_ivsize(tfm);
struct skcipher_sg_list *sgl;
struct af_alg_control con = {};
long copied = 0;
struct skcipher_sg_list *sgl;
struct scatterlist *sg;
struct skcipher_async_req *sreq;
- struct ablkcipher_request *req;
+ struct skcipher_request *req;
struct skcipher_async_rsgl *last_rsgl = NULL;
unsigned int txbufs = 0, len = 0, tx_nents = skcipher_all_sg_nents(ctx);
unsigned int reqlen = sizeof(struct skcipher_async_req) +
}
sg_init_table(sreq->tsg, tx_nents);
memcpy(sreq->iv, ctx->iv, GET_IV_SIZE(ctx));
- ablkcipher_request_set_tfm(req, crypto_ablkcipher_reqtfm(&ctx->req));
- ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- skcipher_async_cb, sk);
+ skcipher_request_set_tfm(req, crypto_skcipher_reqtfm(&ctx->req));
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ skcipher_async_cb, sk);
while (iov_iter_count(&msg->msg_iter)) {
struct skcipher_async_rsgl *rsgl;
if (mark)
sg_mark_end(sreq->tsg + txbufs - 1);
- ablkcipher_request_set_crypt(req, sreq->tsg, sreq->first_sgl.sgl.sg,
- len, sreq->iv);
- err = ctx->enc ? crypto_ablkcipher_encrypt(req) :
- crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, sreq->tsg, sreq->first_sgl.sgl.sg,
+ len, sreq->iv);
+ err = ctx->enc ? crypto_skcipher_encrypt(req) :
+ crypto_skcipher_decrypt(req);
if (err == -EINPROGRESS) {
atomic_inc(&ctx->inflight);
err = -EIOCBQUEUED;
struct sock *sk = sock->sk;
struct alg_sock *ask = alg_sk(sk);
struct skcipher_ctx *ctx = ask->private;
- unsigned bs = crypto_ablkcipher_blocksize(crypto_ablkcipher_reqtfm(
+ unsigned bs = crypto_skcipher_blocksize(crypto_skcipher_reqtfm(
&ctx->req));
struct skcipher_sg_list *sgl;
struct scatterlist *sg;
if (!used)
goto free;
- ablkcipher_request_set_crypt(&ctx->req, sg,
- ctx->rsgl.sg, used,
- ctx->iv);
+ skcipher_request_set_crypt(&ctx->req, sg, ctx->rsgl.sg, used,
+ ctx->iv);
err = af_alg_wait_for_completion(
ctx->enc ?
- crypto_ablkcipher_encrypt(&ctx->req) :
- crypto_ablkcipher_decrypt(&ctx->req),
+ crypto_skcipher_encrypt(&ctx->req) :
+ crypto_skcipher_decrypt(&ctx->req),
&ctx->completion);
free:
static void *skcipher_bind(const char *name, u32 type, u32 mask)
{
- return crypto_alloc_ablkcipher(name, type, mask);
+ return crypto_alloc_skcipher(name, type, mask);
}
static void skcipher_release(void *private)
{
- crypto_free_ablkcipher(private);
+ crypto_free_skcipher(private);
}
static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen)
{
- return crypto_ablkcipher_setkey(private, key, keylen);
+ return crypto_skcipher_setkey(private, key, keylen);
}
static void skcipher_wait(struct sock *sk)
{
struct alg_sock *ask = alg_sk(sk);
struct skcipher_ctx *ctx = ask->private;
- struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(&ctx->req);
if (atomic_read(&ctx->inflight))
skcipher_wait(sk);
skcipher_free_sgl(sk);
- sock_kzfree_s(sk, ctx->iv, crypto_ablkcipher_ivsize(tfm));
+ sock_kzfree_s(sk, ctx->iv, crypto_skcipher_ivsize(tfm));
sock_kfree_s(sk, ctx, ctx->len);
af_alg_release_parent(sk);
}
{
struct skcipher_ctx *ctx;
struct alg_sock *ask = alg_sk(sk);
- unsigned int len = sizeof(*ctx) + crypto_ablkcipher_reqsize(private);
+ unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(private);
ctx = sock_kmalloc(sk, len, GFP_KERNEL);
if (!ctx)
return -ENOMEM;
- ctx->iv = sock_kmalloc(sk, crypto_ablkcipher_ivsize(private),
+ ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(private),
GFP_KERNEL);
if (!ctx->iv) {
sock_kfree_s(sk, ctx, len);
return -ENOMEM;
}
- memset(ctx->iv, 0, crypto_ablkcipher_ivsize(private));
+ memset(ctx->iv, 0, crypto_skcipher_ivsize(private));
INIT_LIST_HEAD(&ctx->tsgl);
ctx->len = len;
ask->private = ctx;
- ablkcipher_request_set_tfm(&ctx->req, private);
- ablkcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- af_alg_complete, &ctx->completion);
+ skcipher_request_set_tfm(&ctx->req, private);
+ skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ af_alg_complete, &ctx->completion);
sk->sk_destruct = skcipher_sock_destruct;
struct dmaengine_unmap_data *unmap = NULL;
if (device)
- unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(device->dev, 2, GFP_NOWAIT);
if (unmap && is_dma_copy_aligned(device, src_offset, dest_offset, len)) {
unsigned long dma_prep_flags = 0;
BUG_ON(disks > 255 || !(P(blocks, disks) || Q(blocks, disks)));
if (device)
- unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOWAIT);
/* XORing P/Q is only implemented in software */
if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
BUG_ON(disks < 4);
if (device)
- unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOWAIT);
if (unmap && disks <= dma_maxpq(device, 0) &&
is_dma_pq_aligned(device, offset, 0, len)) {
u8 *a, *b, *c;
if (dma)
- unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOWAIT);
if (unmap) {
struct device *dev = dma->dev;
u8 *d, *s;
if (dma)
- unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(dma->dev, 3, GFP_NOWAIT);
if (unmap) {
dma_addr_t dma_dest[2];
BUG_ON(src_cnt <= 1);
if (device)
- unmap = dmaengine_get_unmap_data(device->dev, src_cnt+1, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(device->dev, src_cnt+1, GFP_NOWAIT);
if (unmap && is_dma_xor_aligned(device, offset, 0, len)) {
struct dma_async_tx_descriptor *tx;
BUG_ON(src_cnt <= 1);
if (device)
- unmap = dmaengine_get_unmap_data(device->dev, src_cnt, GFP_NOIO);
+ unmap = dmaengine_get_unmap_data(device->dev, src_cnt, GFP_NOWAIT);
if (unmap && src_cnt <= device->max_xor &&
is_dma_xor_aligned(device, offset, 0, len)) {
if (WARN_ON_ONCE(in_irq()))
return -EDEADLK;
+ walk->iv = desc->info;
walk->nbytes = walk->total;
if (unlikely(!walk->total))
return 0;
walk->buffer = NULL;
- walk->iv = desc->info;
if (unlikely(((unsigned long)walk->iv & walk->alignmask))) {
int err = blkcipher_copy_iv(walk);
if (err)
init_completion(&dn->kobj_done);
ret = kobject_init_and_add(&dn->kobj, &acpi_data_node_ktype,
- kobj, dn->name);
+ kobj, "%s", dn->name);
if (ret)
acpi_handle_err(dn->handle, "Failed to expose (%d)\n", ret);
else
if (!dev->driver) {
/* dev->driver may be null if we're being removed */
dev_dbg(dev, "%s: no driver found for dev\n", __func__);
- return;
+ goto out_unlock;
}
if (!acpi_desc) {
goto err_remove_sysfs_thermal;
}
- sysfs_remove_link(&pr->cdev->device.kobj, "device");
+ return 0;
+
err_remove_sysfs_thermal:
sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
err_thermal_unregister:
struct generic_pm_domain *genpd;
bool (*stop_ok)(struct device *__dev);
struct gpd_timing_data *td = &dev_gpd_data(dev)->td;
+ bool runtime_pm = pm_runtime_enabled(dev);
ktime_t time_start;
s64 elapsed_ns;
int ret;
if (IS_ERR(genpd))
return -EINVAL;
+ /*
+ * A runtime PM centric subsystem/driver may re-use the runtime PM
+ * callbacks for other purposes than runtime PM. In those scenarios
+ * runtime PM is disabled. Under these circumstances, we shall skip
+ * validating/measuring the PM QoS latency.
+ */
stop_ok = genpd->gov ? genpd->gov->stop_ok : NULL;
- if (stop_ok && !stop_ok(dev))
+ if (runtime_pm && stop_ok && !stop_ok(dev))
return -EBUSY;
/* Measure suspend latency. */
- time_start = ktime_get();
+ if (runtime_pm)
+ time_start = ktime_get();
ret = genpd_save_dev(genpd, dev);
if (ret)
}
/* Update suspend latency value if the measured time exceeds it. */
- elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start));
- if (elapsed_ns > td->suspend_latency_ns) {
- td->suspend_latency_ns = elapsed_ns;
- dev_dbg(dev, "suspend latency exceeded, %lld ns\n",
- elapsed_ns);
- genpd->max_off_time_changed = true;
- td->constraint_changed = true;
+ if (runtime_pm) {
+ elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start));
+ if (elapsed_ns > td->suspend_latency_ns) {
+ td->suspend_latency_ns = elapsed_ns;
+ dev_dbg(dev, "suspend latency exceeded, %lld ns\n",
+ elapsed_ns);
+ genpd->max_off_time_changed = true;
+ td->constraint_changed = true;
+ }
}
/*
{
struct generic_pm_domain *genpd;
struct gpd_timing_data *td = &dev_gpd_data(dev)->td;
+ bool runtime_pm = pm_runtime_enabled(dev);
ktime_t time_start;
s64 elapsed_ns;
int ret;
out:
/* Measure resume latency. */
- if (timed)
+ if (timed && runtime_pm)
time_start = ktime_get();
genpd_start_dev(genpd, dev);
genpd_restore_dev(genpd, dev);
/* Update resume latency value if the measured time exceeds it. */
- if (timed) {
+ if (timed && runtime_pm) {
elapsed_ns = ktime_to_ns(ktime_sub(ktime_get(), time_start));
if (elapsed_ns > td->resume_latency_ns) {
td->resume_latency_ns = elapsed_ns;
{
struct request_queue *q = NULL;
+ if (cmd->rq)
+ q = cmd->rq->q;
+
switch (queue_mode) {
case NULL_Q_MQ:
blk_mq_end_request(cmd->rq, 0);
break;
case NULL_Q_BIO:
bio_endio(cmd->bio);
- goto free_cmd;
+ break;
}
- if (cmd->rq)
- q = cmd->rq->q;
+ free_cmd(cmd);
/* Restart queue if needed, as we are freeing a tag */
- if (q && !q->mq_ops && blk_queue_stopped(q)) {
+ if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) {
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
- if (blk_queue_stopped(q))
- blk_start_queue(q);
+ blk_start_queue_async(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
-free_cmd:
- free_cmd(cmd);
}
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
goto unmap;
for (n = 0, i = 0; n < nseg; n++) {
+ uint8_t first_sect, last_sect;
+
if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
/* Map indirect segments */
if (segments)
segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
}
i = n % SEGS_PER_INDIRECT_FRAME;
+
pending_req->segments[n]->gref = segments[i].gref;
- seg[n].nsec = segments[i].last_sect -
- segments[i].first_sect + 1;
- seg[n].offset = (segments[i].first_sect << 9);
- if ((segments[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
- (segments[i].last_sect < segments[i].first_sect)) {
+
+ first_sect = READ_ONCE(segments[i].first_sect);
+ last_sect = READ_ONCE(segments[i].last_sect);
+ if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
rc = -EINVAL;
goto unmap;
}
+
+ seg[n].nsec = last_sect - first_sect + 1;
+ seg[n].offset = first_sect << 9;
preq->nr_sects += seg[n].nsec;
}
struct blkif_x86_32_request *src)
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
- dst->operation = src->operation;
- switch (src->operation) {
+ dst->operation = READ_ONCE(src->operation);
+ switch (dst->operation) {
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
struct blkif_x86_64_request *src)
{
int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
- dst->operation = src->operation;
- switch (src->operation) {
+ dst->operation = READ_ONCE(src->operation);
+ switch (dst->operation) {
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
ret = _sunxi_rsb_run_xfer(rsb);
if (ret)
- goto out;
+ goto unlock;
*buf = readl(rsb->regs + RSB_DATA);
+unlock:
mutex_unlock(&rsb->lock);
-out:
return ret;
}
*/
static const struct sunxi_rsb_addr_map sunxi_rsb_addr_maps[] = {
- { 0x3e3, 0x2d }, /* Primary PMIC: AXP223, AXP809, AXP81X, ... */
+ { 0x3a3, 0x2d }, /* Primary PMIC: AXP223, AXP809, AXP81X, ... */
{ 0x745, 0x3a }, /* Secondary PMIC: AXP806, ... */
- { 0xe89, 0x45 }, /* Peripheral IC: AC100, ... */
+ { 0xe89, 0x4e }, /* Peripheral IC: AC100, ... */
};
static u8 sunxi_rsb_get_rtaddr(u16 hwaddr)
*
* It checks skb, netlink header and msg sizes, and calls callback helper.
*/
-static void cn_rx_skb(struct sk_buff *__skb)
+static void cn_rx_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
- struct sk_buff *skb;
int len, err;
- skb = skb_get(__skb);
-
if (skb->len >= NLMSG_HDRLEN) {
nlh = nlmsg_hdr(skb);
len = nlmsg_len(nlh);
if (len < (int)sizeof(struct cn_msg) ||
skb->len < nlh->nlmsg_len ||
- len > CONNECTOR_MAX_MSG_SIZE) {
- kfree_skb(skb);
+ len > CONNECTOR_MAX_MSG_SIZE)
return;
- }
- err = cn_call_callback(skb);
+ err = cn_call_callback(skb_get(skb));
if (err < 0)
kfree_skb(skb);
}
config ARM_TEGRA124_CPUFREQ
tristate "Tegra124 CPUFreq support"
- depends on ARCH_TEGRA && CPUFREQ_DT
+ depends on ARCH_TEGRA && CPUFREQ_DT && REGULATOR
default y
help
This adds the CPUFreq driver support for Tegra124 SOCs.
limits->max_sysfs_pct);
limits->max_perf_pct = max(limits->min_policy_pct,
limits->max_perf_pct);
- limits->max_perf = round_up(limits->max_perf, 8);
+ limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
/* Make sure min_perf_pct <= max_perf_pct */
limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
static struct scpi_dvfs_info *scpi_get_dvfs_info(struct device *cpu_dev)
{
- u8 domain = topology_physical_package_id(cpu_dev->id);
+ int domain = topology_physical_package_id(cpu_dev->id);
if (domain < 0)
return ERR_PTR(-EINVAL);
#define AT_XDMAC_CC_WRIP (0x1 << 23) /* Write in Progress (read only) */
#define AT_XDMAC_CC_WRIP_DONE (0x0 << 23)
#define AT_XDMAC_CC_WRIP_IN_PROGRESS (0x1 << 23)
-#define AT_XDMAC_CC_PERID(i) (0x7f & (h) << 24) /* Channel Peripheral Identifier */
+#define AT_XDMAC_CC_PERID(i) (0x7f & (i) << 24) /* Channel Peripheral Identifier */
#define AT_XDMAC_CDS_MSP 0x2C /* Channel Data Stride Memory Set Pattern */
#define AT_XDMAC_CSUS 0x30 /* Channel Source Microblock Stride */
#define AT_XDMAC_CDUS 0x34 /* Channel Destination Microblock Stride */
NULL,
src_addr, dst_addr,
xt, xt->sgl);
- for (i = 0; i < xt->numf; i++)
+
+ /* Length of the block is (BLEN+1) microblocks. */
+ for (i = 0; i < xt->numf - 1; i++)
at_xdmac_increment_block_count(chan, first);
dev_dbg(chan2dev(chan), "%s: add desc 0x%p to descs_list 0x%p\n",
/* Check remaining length and change data width if needed. */
dwidth = at_xdmac_align_width(chan,
src_addr | dst_addr | xfer_size);
+ chan_cc &= ~AT_XDMAC_CC_DWIDTH_MASK;
chan_cc |= AT_XDMAC_CC_DWIDTH(dwidth);
ublen = xfer_size >> dwidth;
* since we don't care about the stride anymore.
*/
if ((i == (sg_len - 1)) &&
- sg_dma_len(ppsg) == sg_dma_len(psg)) {
+ sg_dma_len(psg) == sg_dma_len(sg)) {
dev_dbg(chan2dev(chan),
"%s: desc 0x%p can be merged with desc 0x%p\n",
__func__, desc, pdesc);
*/
#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/interrupt.h>
uint32_t pad[2];
};
+struct bcm2835_cb_entry {
+ struct bcm2835_dma_cb *cb;
+ dma_addr_t paddr;
+};
+
struct bcm2835_chan {
struct virt_dma_chan vc;
struct list_head node;
int ch;
struct bcm2835_desc *desc;
+ struct dma_pool *cb_pool;
void __iomem *chan_base;
int irq_number;
};
struct bcm2835_desc {
+ struct bcm2835_chan *c;
struct virt_dma_desc vd;
enum dma_transfer_direction dir;
- unsigned int control_block_size;
- struct bcm2835_dma_cb *control_block_base;
- dma_addr_t control_block_base_phys;
+ struct bcm2835_cb_entry *cb_list;
unsigned int frames;
size_t size;
static void bcm2835_dma_desc_free(struct virt_dma_desc *vd)
{
struct bcm2835_desc *desc = container_of(vd, struct bcm2835_desc, vd);
- dma_free_coherent(desc->vd.tx.chan->device->dev,
- desc->control_block_size,
- desc->control_block_base,
- desc->control_block_base_phys);
+ int i;
+
+ for (i = 0; i < desc->frames; i++)
+ dma_pool_free(desc->c->cb_pool, desc->cb_list[i].cb,
+ desc->cb_list[i].paddr);
+
+ kfree(desc->cb_list);
kfree(desc);
}
c->desc = d = to_bcm2835_dma_desc(&vd->tx);
- writel(d->control_block_base_phys, c->chan_base + BCM2835_DMA_ADDR);
+ writel(d->cb_list[0].paddr, c->chan_base + BCM2835_DMA_ADDR);
writel(BCM2835_DMA_ACTIVE, c->chan_base + BCM2835_DMA_CS);
}
static int bcm2835_dma_alloc_chan_resources(struct dma_chan *chan)
{
struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
+ struct device *dev = c->vc.chan.device->dev;
+
+ dev_dbg(dev, "Allocating DMA channel %d\n", c->ch);
- dev_dbg(c->vc.chan.device->dev,
- "Allocating DMA channel %d\n", c->ch);
+ c->cb_pool = dma_pool_create(dev_name(dev), dev,
+ sizeof(struct bcm2835_dma_cb), 0, 0);
+ if (!c->cb_pool) {
+ dev_err(dev, "unable to allocate descriptor pool\n");
+ return -ENOMEM;
+ }
return request_irq(c->irq_number,
bcm2835_dma_callback, 0, "DMA IRQ", c);
vchan_free_chan_resources(&c->vc);
free_irq(c->irq_number, c);
+ dma_pool_destroy(c->cb_pool);
dev_dbg(c->vc.chan.device->dev, "Freeing DMA channel %u\n", c->ch);
}
size_t size;
for (size = i = 0; i < d->frames; i++) {
- struct bcm2835_dma_cb *control_block =
- &d->control_block_base[i];
+ struct bcm2835_dma_cb *control_block = d->cb_list[i].cb;
size_t this_size = control_block->length;
dma_addr_t dma;
dma_addr_t dev_addr;
unsigned int es, sync_type;
unsigned int frame;
+ int i;
/* Grab configuration */
if (!is_slave_direction(direction)) {
if (!d)
return NULL;
+ d->c = c;
d->dir = direction;
d->frames = buf_len / period_len;
- /* Allocate memory for control blocks */
- d->control_block_size = d->frames * sizeof(struct bcm2835_dma_cb);
- d->control_block_base = dma_zalloc_coherent(chan->device->dev,
- d->control_block_size, &d->control_block_base_phys,
- GFP_NOWAIT);
-
- if (!d->control_block_base) {
+ d->cb_list = kcalloc(d->frames, sizeof(*d->cb_list), GFP_KERNEL);
+ if (!d->cb_list) {
kfree(d);
return NULL;
}
+ /* Allocate memory for control blocks */
+ for (i = 0; i < d->frames; i++) {
+ struct bcm2835_cb_entry *cb_entry = &d->cb_list[i];
+
+ cb_entry->cb = dma_pool_zalloc(c->cb_pool, GFP_ATOMIC,
+ &cb_entry->paddr);
+ if (!cb_entry->cb)
+ goto error_cb;
+ }
/*
* Iterate over all frames, create a control block
* for each frame and link them together.
*/
for (frame = 0; frame < d->frames; frame++) {
- struct bcm2835_dma_cb *control_block =
- &d->control_block_base[frame];
+ struct bcm2835_dma_cb *control_block = d->cb_list[frame].cb;
/* Setup adresses */
if (d->dir == DMA_DEV_TO_MEM) {
* This DMA engine driver currently only supports cyclic DMA.
* Therefore, wrap around at number of frames.
*/
- control_block->next = d->control_block_base_phys +
- sizeof(struct bcm2835_dma_cb)
- * ((frame + 1) % d->frames);
+ control_block->next = d->cb_list[((frame + 1) % d->frames)].paddr;
}
return vchan_tx_prep(&c->vc, &d->vd, flags);
+error_cb:
+ i--;
+ for (; i >= 0; i--) {
+ struct bcm2835_cb_entry *cb_entry = &d->cb_list[i];
+
+ dma_pool_free(c->cb_pool, cb_entry->cb, cb_entry->paddr);
+ }
+
+ kfree(d->cb_list);
+ kfree(d);
+ return NULL;
}
static int bcm2835_dma_slave_config(struct dma_chan *chan,
return ret;
}
-static bool edma_is_memcpy_channel(int ch_num, u16 *memcpy_channels)
+static bool edma_is_memcpy_channel(int ch_num, s32 *memcpy_channels)
{
- s16 *memcpy_ch = memcpy_channels;
-
if (!memcpy_channels)
return false;
- while (*memcpy_ch != -1) {
- if (*memcpy_ch == ch_num)
+ while (*memcpy_channels != -1) {
+ if (*memcpy_channels == ch_num)
return true;
- memcpy_ch++;
+ memcpy_channels++;
}
return false;
}
{
struct dma_device *s_ddev = &ecc->dma_slave;
struct dma_device *m_ddev = NULL;
- s16 *memcpy_channels = ecc->info->memcpy_channels;
+ s32 *memcpy_channels = ecc->info->memcpy_channels;
int i, j;
dma_cap_zero(s_ddev->cap_mask);
prop = of_find_property(dev->of_node, "ti,edma-memcpy-channels", &sz);
if (prop) {
const char pname[] = "ti,edma-memcpy-channels";
- size_t nelm = sz / sizeof(s16);
- s16 *memcpy_ch;
+ size_t nelm = sz / sizeof(s32);
+ s32 *memcpy_ch;
- memcpy_ch = devm_kcalloc(dev, nelm + 1, sizeof(s16),
+ memcpy_ch = devm_kcalloc(dev, nelm + 1, sizeof(s32),
GFP_KERNEL);
if (!memcpy_ch)
return ERR_PTR(-ENOMEM);
- ret = of_property_read_u16_array(dev->of_node, pname,
- (u16 *)memcpy_ch, nelm);
+ ret = of_property_read_u32_array(dev->of_node, pname,
+ (u32 *)memcpy_ch, nelm);
if (ret)
return ERR_PTR(ret);
&sz);
if (prop) {
const char pname[] = "ti,edma-reserved-slot-ranges";
+ u32 (*tmp)[2];
s16 (*rsv_slots)[2];
- size_t nelm = sz / sizeof(*rsv_slots);
+ size_t nelm = sz / sizeof(*tmp);
struct edma_rsv_info *rsv_info;
+ int i;
if (!nelm)
return info;
+ tmp = kcalloc(nelm, sizeof(*tmp), GFP_KERNEL);
+ if (!tmp)
+ return ERR_PTR(-ENOMEM);
+
rsv_info = devm_kzalloc(dev, sizeof(*rsv_info), GFP_KERNEL);
- if (!rsv_info)
+ if (!rsv_info) {
+ kfree(tmp);
return ERR_PTR(-ENOMEM);
+ }
rsv_slots = devm_kcalloc(dev, nelm + 1, sizeof(*rsv_slots),
GFP_KERNEL);
- if (!rsv_slots)
+ if (!rsv_slots) {
+ kfree(tmp);
return ERR_PTR(-ENOMEM);
+ }
- ret = of_property_read_u16_array(dev->of_node, pname,
- (u16 *)rsv_slots, nelm * 2);
- if (ret)
+ ret = of_property_read_u32_array(dev->of_node, pname,
+ (u32 *)tmp, nelm * 2);
+ if (ret) {
+ kfree(tmp);
return ERR_PTR(ret);
+ }
+ for (i = 0; i < nelm; i++) {
+ rsv_slots[i][0] = tmp[i][0];
+ rsv_slots[i][1] = tmp[i][1];
+ }
rsv_slots[nelm][0] = -1;
rsv_slots[nelm][1] = -1;
+
info->rsv = rsv_info;
info->rsv->rsv_slots = (const s16 (*)[2])rsv_slots;
+
+ kfree(tmp);
}
return info;
#include <linux/dmapool.h>
#include <linux/interrupt.h>
#include <linux/io.h>
+#include <linux/irq.h>
#include <linux/module.h>
#include <linux/of_device.h>
/* Register DMA channel rx irq */
for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) {
chan = &pdma->chan[i];
+ irq_set_status_flags(chan->rx_irq, IRQ_DISABLE_UNLAZY);
ret = devm_request_irq(chan->dev, chan->rx_irq,
xgene_dma_chan_ring_isr,
0, chan->name, chan);
for (j = 0; j < i; j++) {
chan = &pdma->chan[i];
+ irq_clear_status_flags(chan->rx_irq, IRQ_DISABLE_UNLAZY);
devm_free_irq(chan->dev, chan->rx_irq, chan);
}
for (i = 0; i < XGENE_DMA_MAX_CHANNEL; i++) {
chan = &pdma->chan[i];
+ irq_clear_status_flags(chan->rx_irq, IRQ_DISABLE_UNLAZY);
devm_free_irq(chan->dev, chan->rx_irq, chan);
}
}
dmi_ver = smbios_ver;
else
dmi_ver = (buf[14] & 0xF0) << 4 | (buf[14] & 0x0F);
+ dmi_ver <<= 8;
dmi_num = get_unaligned_le16(buf + 12);
dmi_len = get_unaligned_le16(buf + 6);
dmi_base = get_unaligned_le32(buf + 8);
if (dmi_walk_early(dmi_decode) == 0) {
if (smbios_ver) {
pr_info("SMBIOS %d.%d present.\n",
- dmi_ver >> 8, dmi_ver & 0xFF);
+ dmi_ver >> 16, (dmi_ver >> 8) & 0xFF);
} else {
smbios_entry_point_size = 15;
memcpy(smbios_entry_point, buf,
smbios_entry_point_size);
pr_info("Legacy DMI %d.%d present.\n",
- dmi_ver >> 8, dmi_ver & 0xFF);
+ dmi_ver >> 16, (dmi_ver >> 8) & 0xFF);
}
- dmi_ver <<= 8;
dmi_format_ids(dmi_ids_string, sizeof(dmi_ids_string));
printk(KERN_DEBUG "DMI: %s\n", dmi_ids_string);
return 0;
__raw_writel(BIT(offset), ctrl->base + AR71XX_GPIO_REG_CLEAR);
__raw_writel(
- __raw_readl(ctrl->base + AR71XX_GPIO_REG_OE) & BIT(offset),
+ __raw_readl(ctrl->base + AR71XX_GPIO_REG_OE) & ~BIT(offset),
ctrl->base + AR71XX_GPIO_REG_OE);
spin_unlock_irqrestore(&ctrl->lock, flags);
unsigned long pinmask = bgc->pin2mask(bgc, gpio);
if (bgc->dir & pinmask)
- return bgc->read_reg(bgc->reg_set) & pinmask;
+ return !!(bgc->read_reg(bgc->reg_set) & pinmask);
else
- return bgc->read_reg(bgc->reg_dat) & pinmask;
+ return !!(bgc->read_reg(bgc->reg_dat) & pinmask);
}
static int bgpio_get(struct gpio_chip *gc, unsigned int gpio)
chip = desc->chip;
offset = gpio_chip_hwgpio(desc);
value = chip->get ? chip->get(chip, offset) : -EIO;
- value = value < 0 ? value : !!value;
+ /*
+ * FIXME: fix all drivers to clamp to [0,1] or return negative,
+ * then change this to:
+ * value = value < 0 ? value : !!value;
+ * so we can properly propagate error codes.
+ */
+ value = !!value;
trace_gpio_value(desc_to_gpio(desc), 1, value);
return value;
}
struct ww_acquire_ctx ticket;
/* user fence */
- struct amdgpu_user_fence uf;
+ struct amdgpu_user_fence uf;
+ struct amdgpu_bo_list_entry uf_entry;
};
struct amdgpu_job {
return 0;
}
+static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
+ struct drm_amdgpu_cs_chunk_fence *fence_data)
+{
+ struct drm_gem_object *gobj;
+ uint32_t handle;
+
+ handle = fence_data->handle;
+ gobj = drm_gem_object_lookup(p->adev->ddev, p->filp,
+ fence_data->handle);
+ if (gobj == NULL)
+ return -EINVAL;
+
+ p->uf.bo = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
+ p->uf.offset = fence_data->offset;
+
+ if (amdgpu_ttm_tt_has_userptr(p->uf.bo->tbo.ttm)) {
+ drm_gem_object_unreference_unlocked(gobj);
+ return -EINVAL;
+ }
+
+ p->uf_entry.robj = amdgpu_bo_ref(p->uf.bo);
+ p->uf_entry.prefered_domains = AMDGPU_GEM_DOMAIN_GTT;
+ p->uf_entry.allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
+ p->uf_entry.priority = 0;
+ p->uf_entry.tv.bo = &p->uf_entry.robj->tbo;
+ p->uf_entry.tv.shared = true;
+
+ drm_gem_object_unreference_unlocked(gobj);
+ return 0;
+}
+
int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
{
union drm_amdgpu_cs *cs = data;
case AMDGPU_CHUNK_ID_FENCE:
size = sizeof(struct drm_amdgpu_cs_chunk_fence);
- if (p->chunks[i].length_dw * sizeof(uint32_t) >= size) {
- uint32_t handle;
- struct drm_gem_object *gobj;
- struct drm_amdgpu_cs_chunk_fence *fence_data;
-
- fence_data = (void *)p->chunks[i].kdata;
- handle = fence_data->handle;
- gobj = drm_gem_object_lookup(p->adev->ddev,
- p->filp, handle);
- if (gobj == NULL) {
- ret = -EINVAL;
- goto free_partial_kdata;
- }
-
- p->uf.bo = gem_to_amdgpu_bo(gobj);
- amdgpu_bo_ref(p->uf.bo);
- drm_gem_object_unreference_unlocked(gobj);
- p->uf.offset = fence_data->offset;
- } else {
+ if (p->chunks[i].length_dw * sizeof(uint32_t) < size) {
ret = -EINVAL;
goto free_partial_kdata;
}
+
+ ret = amdgpu_cs_user_fence_chunk(p, (void *)p->chunks[i].kdata);
+ if (ret)
+ goto free_partial_kdata;
+
break;
case AMDGPU_CHUNK_ID_DEPENDENCIES:
p->vm_bos = amdgpu_vm_get_bos(p->adev, &fpriv->vm,
&p->validated);
+ if (p->uf.bo)
+ list_add(&p->uf_entry.tv.head, &p->validated);
+
if (need_mmap_lock)
down_read(&current->mm->mmap_sem);
for (i = 0; i < parser->num_ibs; i++)
amdgpu_ib_free(parser->adev, &parser->ibs[i]);
kfree(parser->ibs);
- if (parser->uf.bo)
- amdgpu_bo_unref(&parser->uf.bo);
+ amdgpu_bo_unref(&parser->uf.bo);
+ amdgpu_bo_unref(&parser->uf_entry.robj);
}
static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p,
mode_flags |= DRM_MODE_FLAG_3D_MASK;
list_for_each_entry(mode, &connector->modes, head) {
- mode->status = drm_mode_validate_basic(mode);
+ if (mode->status == MODE_OK)
+ mode->status = drm_mode_validate_basic(mode);
if (mode->status == MODE_OK)
mode->status = drm_mode_validate_size(mode, maxX, maxY);
{
struct exynos_drm_crtc *exynos_crtc = to_exynos_crtc(crtc);
+ if (!state->enable)
+ return 0;
+
if (exynos_crtc->ops->atomic_check)
return exynos_crtc->ops->atomic_check(exynos_crtc, state);
struct drm_i915_private *i915;
struct intel_engine_cs *ring;
- /** GEM sequence number associated with this request. */
- uint32_t seqno;
+ /** GEM sequence number associated with the previous request,
+ * when the HWS breadcrumb is equal to this the GPU is processing
+ * this request.
+ */
+ u32 previous_seqno;
+
+ /** GEM sequence number associated with this request,
+ * when the HWS breadcrumb is equal or greater than this the GPU
+ * has finished processing this request.
+ */
+ u32 seqno;
/** Position in the ringbuffer of the start of the request */
u32 head;
int i915_vma_bind(struct i915_vma *vma, enum i915_cache_level cache_level,
u32 flags);
+void __i915_vma_set_map_and_fenceable(struct i915_vma *vma);
int __must_check i915_vma_unbind(struct i915_vma *vma);
/*
* BEWARE: Do not use the function below unless you can _absolutely_
return (int32_t)(seq1 - seq2) >= 0;
}
+static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
+ bool lazy_coherency)
+{
+ u32 seqno = req->ring->get_seqno(req->ring, lazy_coherency);
+ return i915_seqno_passed(seqno, req->previous_seqno);
+}
+
static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
bool lazy_coherency)
{
- u32 seqno;
-
- BUG_ON(req == NULL);
-
- seqno = req->ring->get_seqno(req->ring, lazy_coherency);
-
+ u32 seqno = req->ring->get_seqno(req->ring, lazy_coherency);
return i915_seqno_passed(seqno, req->seqno);
}
return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
}
-static int __i915_spin_request(struct drm_i915_gem_request *req)
+static unsigned long local_clock_us(unsigned *cpu)
+{
+ unsigned long t;
+
+ /* Cheaply and approximately convert from nanoseconds to microseconds.
+ * The result and subsequent calculations are also defined in the same
+ * approximate microseconds units. The principal source of timing
+ * error here is from the simple truncation.
+ *
+ * Note that local_clock() is only defined wrt the current CPU;
+ * the comparisons are no longer valid if we switch CPUs. Instead of
+ * blocking preemption for the entire busywait, we can detect the CPU
+ * switch and use that as indicator of system load and a reason to
+ * stop busywaiting, see busywait_stop().
+ */
+ *cpu = get_cpu();
+ t = local_clock() >> 10;
+ put_cpu();
+
+ return t;
+}
+
+static bool busywait_stop(unsigned long timeout, unsigned cpu)
+{
+ unsigned this_cpu;
+
+ if (time_after(local_clock_us(&this_cpu), timeout))
+ return true;
+
+ return this_cpu != cpu;
+}
+
+static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
{
unsigned long timeout;
+ unsigned cpu;
+
+ /* When waiting for high frequency requests, e.g. during synchronous
+ * rendering split between the CPU and GPU, the finite amount of time
+ * required to set up the irq and wait upon it limits the response
+ * rate. By busywaiting on the request completion for a short while we
+ * can service the high frequency waits as quick as possible. However,
+ * if it is a slow request, we want to sleep as quickly as possible.
+ * The tradeoff between waiting and sleeping is roughly the time it
+ * takes to sleep on a request, on the order of a microsecond.
+ */
- if (i915_gem_request_get_ring(req)->irq_refcount)
+ if (req->ring->irq_refcount)
return -EBUSY;
- timeout = jiffies + 1;
+ /* Only spin if we know the GPU is processing this request */
+ if (!i915_gem_request_started(req, true))
+ return -EAGAIN;
+
+ timeout = local_clock_us(&cpu) + 5;
while (!need_resched()) {
if (i915_gem_request_completed(req, true))
return 0;
- if (time_after_eq(jiffies, timeout))
+ if (signal_pending_state(state, current))
+ break;
+
+ if (busywait_stop(timeout, cpu))
break;
cpu_relax_lowlatency();
}
+
if (i915_gem_request_completed(req, false))
return 0;
struct drm_i915_private *dev_priv = dev->dev_private;
const bool irq_test_in_progress =
ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
+ int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
DEFINE_WAIT(wait);
unsigned long timeout_expire;
s64 before, now;
before = ktime_get_raw_ns();
/* Optimistic spin for the next jiffie before touching IRQs */
- ret = __i915_spin_request(req);
+ ret = __i915_spin_request(req, state);
if (ret == 0)
goto out;
for (;;) {
struct timer_list timer;
- prepare_to_wait(&ring->irq_queue, &wait,
- interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&ring->irq_queue, &wait, state);
/* We need to check whether any gpu reset happened in between
* the caller grabbing the seqno and now ... */
break;
}
- if (interruptible && signal_pending(current)) {
+ if (signal_pending_state(state, current)) {
ret = -ERESTARTSYS;
break;
}
request->batch_obj = obj;
request->emitted_jiffies = jiffies;
+ request->previous_seqno = ring->last_submitted_seqno;
ring->last_submitted_seqno = request->seqno;
list_add_tail(&request->list, &ring->request_list);
return false;
}
+void __i915_vma_set_map_and_fenceable(struct i915_vma *vma)
+{
+ struct drm_i915_gem_object *obj = vma->obj;
+ bool mappable, fenceable;
+ u32 fence_size, fence_alignment;
+
+ fence_size = i915_gem_get_gtt_size(obj->base.dev,
+ obj->base.size,
+ obj->tiling_mode);
+ fence_alignment = i915_gem_get_gtt_alignment(obj->base.dev,
+ obj->base.size,
+ obj->tiling_mode,
+ true);
+
+ fenceable = (vma->node.size == fence_size &&
+ (vma->node.start & (fence_alignment - 1)) == 0);
+
+ mappable = (vma->node.start + fence_size <=
+ to_i915(obj->base.dev)->gtt.mappable_end);
+
+ obj->map_and_fenceable = mappable && fenceable;
+}
+
static int
i915_gem_object_do_pin(struct drm_i915_gem_object *obj,
struct i915_address_space *vm,
if (ggtt_view && ggtt_view->type == I915_GGTT_VIEW_NORMAL &&
(bound ^ vma->bound) & GLOBAL_BIND) {
- bool mappable, fenceable;
- u32 fence_size, fence_alignment;
-
- fence_size = i915_gem_get_gtt_size(obj->base.dev,
- obj->base.size,
- obj->tiling_mode);
- fence_alignment = i915_gem_get_gtt_alignment(obj->base.dev,
- obj->base.size,
- obj->tiling_mode,
- true);
-
- fenceable = (vma->node.size == fence_size &&
- (vma->node.start & (fence_alignment - 1)) == 0);
-
- mappable = (vma->node.start + fence_size <=
- dev_priv->gtt.mappable_end);
-
- obj->map_and_fenceable = mappable && fenceable;
-
+ __i915_vma_set_map_and_fenceable(vma);
WARN_ON(flags & PIN_MAPPABLE && !obj->map_and_fenceable);
}
if (!ppgtt)
return;
- WARN_ON(!list_empty(&ppgtt->base.active_list));
-
list_for_each_entry_safe(vma, next, &ppgtt->base.inactive_list,
mm_list) {
if (WARN_ON(__i915_vma_unbind_no_wait(vma)))
return ret;
}
vma->bound |= GLOBAL_BIND;
+ __i915_vma_set_map_and_fenceable(vma);
list_add_tail(&vma->mm_list, &ggtt_vm->inactive_list);
}
}
vma->bound |= GLOBAL_BIND;
+ __i915_vma_set_map_and_fenceable(vma);
list_add_tail(&vma->mm_list, &ggtt->inactive_list);
}
static void ironlake_pfit_disable(struct intel_crtc *crtc, bool force);
static void ironlake_pfit_enable(struct intel_crtc *crtc);
static void intel_modeset_setup_hw_state(struct drm_device *dev);
+static void intel_pre_disable_primary(struct drm_crtc *crtc);
typedef struct {
int min, max;
struct drm_i915_gem_object *obj;
struct drm_plane *primary = intel_crtc->base.primary;
struct drm_plane_state *plane_state = primary->state;
+ struct drm_crtc_state *crtc_state = intel_crtc->base.state;
+ struct intel_plane *intel_plane = to_intel_plane(primary);
struct drm_framebuffer *fb;
if (!plane_config->fb)
}
}
+ /*
+ * We've failed to reconstruct the BIOS FB. Current display state
+ * indicates that the primary plane is visible, but has a NULL FB,
+ * which will lead to problems later if we don't fix it up. The
+ * simplest solution is to just disable the primary plane now and
+ * pretend the BIOS never had it enabled.
+ */
+ to_intel_plane_state(plane_state)->visible = false;
+ crtc_state->plane_mask &= ~(1 << drm_plane_index(primary));
+ intel_pre_disable_primary(&intel_crtc->base);
+ intel_plane->disable_plane(primary, &intel_crtc->base);
+
return;
valid_fb:
if (to_intel_plane_state(crtc->primary->state)->visible) {
intel_crtc_wait_for_pending_flips(crtc);
intel_pre_disable_primary(crtc);
+
+ intel_crtc_disable_planes(crtc, 1 << drm_plane_index(crtc->primary));
+ to_intel_plane_state(crtc->primary->state)->visible = false;
}
- intel_crtc_disable_planes(crtc, crtc->state->plane_mask);
dev_priv->display.crtc_disable(crtc);
intel_crtc->active = false;
intel_update_watermarks(crtc);
return true;
}
-static void i845_update_cursor(struct drm_crtc *crtc, u32 base)
+static void i845_update_cursor(struct drm_crtc *crtc, u32 base, bool on)
{
struct drm_device *dev = crtc->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
uint32_t cntl = 0, size = 0;
- if (base) {
+ if (on) {
unsigned int width = intel_crtc->base.cursor->state->crtc_w;
unsigned int height = intel_crtc->base.cursor->state->crtc_h;
unsigned int stride = roundup_pow_of_two(width) * 4;
}
}
-static void i9xx_update_cursor(struct drm_crtc *crtc, u32 base)
+static void i9xx_update_cursor(struct drm_crtc *crtc, u32 base, bool on)
{
struct drm_device *dev = crtc->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
int pipe = intel_crtc->pipe;
- uint32_t cntl;
+ uint32_t cntl = 0;
- cntl = 0;
- if (base) {
+ if (on) {
cntl = MCURSOR_GAMMA_ENABLE;
switch (intel_crtc->base.cursor->state->crtc_w) {
case 64:
int y = cursor_state->crtc_y;
u32 base = 0, pos = 0;
- if (on)
- base = intel_crtc->cursor_addr;
+ base = intel_crtc->cursor_addr;
if (x >= intel_crtc->config->pipe_src_w)
- base = 0;
+ on = false;
if (y >= intel_crtc->config->pipe_src_h)
- base = 0;
+ on = false;
if (x < 0) {
if (x + cursor_state->crtc_w <= 0)
- base = 0;
+ on = false;
pos |= CURSOR_POS_SIGN << CURSOR_X_SHIFT;
x = -x;
if (y < 0) {
if (y + cursor_state->crtc_h <= 0)
- base = 0;
+ on = false;
pos |= CURSOR_POS_SIGN << CURSOR_Y_SHIFT;
y = -y;
}
pos |= y << CURSOR_Y_SHIFT;
- if (base == 0 && intel_crtc->cursor_base == 0)
- return;
-
I915_WRITE(CURPOS(pipe), pos);
/* ILK+ do this automagically */
}
if (IS_845G(dev) || IS_I865G(dev))
- i845_update_cursor(crtc, base);
+ i845_update_cursor(crtc, base, on);
else
- i9xx_update_cursor(crtc, base);
+ i9xx_update_cursor(crtc, base, on);
}
static bool cursor_size_ok(struct drm_device *dev,
static bool check_digital_port_conflicts(struct drm_atomic_state *state)
{
struct drm_device *dev = state->dev;
- struct intel_encoder *encoder;
struct drm_connector *connector;
- struct drm_connector_state *connector_state;
unsigned int used_ports = 0;
- int i;
/*
* Walk the connector list instead of the encoder
* list to detect the problem on ddi platforms
* where there's just one encoder per digital port.
*/
- for_each_connector_in_state(state, connector, connector_state, i) {
+ drm_for_each_connector(connector, dev) {
+ struct drm_connector_state *connector_state;
+ struct intel_encoder *encoder;
+
+ connector_state = drm_atomic_get_existing_connector_state(state, connector);
+ if (!connector_state)
+ connector_state = connector->state;
+
if (!connector_state->best_encoder)
continue;
struct drm_crtc *crtc = crtc_state->base.crtc;
struct drm_framebuffer *fb = state->base.fb;
struct drm_i915_gem_object *obj = intel_fb_obj(fb);
+ enum pipe pipe = to_intel_plane(plane)->pipe;
unsigned stride;
int ret;
return -EINVAL;
}
+ /*
+ * There's something wrong with the cursor on CHV pipe C.
+ * If it straddles the left edge of the screen then
+ * moving it away from the edge or disabling it often
+ * results in a pipe underrun, and often that can lead to a
+ * dead pipe (constant underrun reported, and it scans
+ * out just a solid color). To recover from that, the
+ * display power well must be turned off and on again.
+ * Refuse to put the cursor into that compromised position.
+ */
+ if (IS_CHERRYVIEW(plane->dev) && pipe == PIPE_C &&
+ state->visible && state->base.crtc_x < 0) {
+ DRM_DEBUG_KMS("CHV cursor C not allowed to straddle the left screen edge\n");
+ return -EINVAL;
+ }
+
return 0;
}
crtc = crtc ? crtc : plane->crtc;
intel_crtc = to_intel_crtc(crtc);
- if (intel_crtc->cursor_bo == obj)
- goto update;
-
if (!obj)
addr = 0;
else if (!INTEL_INFO(dev)->cursor_needs_physical)
addr = obj->phys_handle->busaddr;
intel_crtc->cursor_addr = addr;
- intel_crtc->cursor_bo = obj;
-update:
if (crtc->state->active)
intel_crtc_update_cursor(crtc, state->visible);
}
int adjusted_x;
int adjusted_y;
- struct drm_i915_gem_object *cursor_bo;
uint32_t cursor_addr;
uint32_t cursor_cntl;
uint32_t cursor_size;
struct intel_hdmi *intel_hdmi = intel_attached_hdmi(connector);
struct drm_i915_private *dev_priv = to_i915(connector->dev);
bool live_status = false;
- unsigned int retry = 3;
+ unsigned int try;
DRM_DEBUG_KMS("[CONNECTOR:%d:%s]\n",
connector->base.id, connector->name);
intel_display_power_get(dev_priv, POWER_DOMAIN_GMBUS);
- while (!live_status && --retry) {
+ for (try = 0; !live_status && try < 9; try++) {
+ if (try)
+ msleep(10);
live_status = intel_digital_port_connected(dev_priv,
hdmi_to_dig_port(intel_hdmi));
- mdelay(10);
}
if (!live_status)
/* 2b: Program RC6 thresholds.*/
/* WaRsDoubleRc6WrlWithCoarsePowerGating: Doubling WRL only when CPG is enabled */
- if (IS_SKYLAKE(dev) && !((IS_SKL_GT3(dev) || IS_SKL_GT4(dev)) &&
- (INTEL_REVID(dev) <= SKL_REVID_E0)))
+ if (IS_SKYLAKE(dev))
I915_WRITE(GEN6_RC6_WAKE_RATE_LIMIT, 108 << 16);
else
I915_WRITE(GEN6_RC6_WAKE_RATE_LIMIT, 54 << 16);
* WaRsDisableCoarsePowerGating:skl,bxt - Render/Media PG need to be disabled with RC6.
*/
if ((IS_BROXTON(dev) && (INTEL_REVID(dev) < BXT_REVID_B0)) ||
- ((IS_SKL_GT3(dev) || IS_SKL_GT4(dev)) && (INTEL_REVID(dev) <= SKL_REVID_E0)))
+ ((IS_SKL_GT3(dev) || IS_SKL_GT4(dev)) && (INTEL_REVID(dev) <= SKL_REVID_F0)))
I915_WRITE(GEN9_PG_ENABLE, 0);
else
I915_WRITE(GEN9_PG_ENABLE, (rc6_mask & GEN6_RC_CTL_RC6_ENABLE) ?
return -ENOMEM;
nvkm_object_ctor(&nv40_gr_chan, oclass, &chan->object);
chan->gr = gr;
+ chan->fifo = fifoch;
*pobject = &chan->object;
spin_lock_irqsave(&chan->gr->base.engine.lock, flags);
fan->type = NVBIOS_THERM_FAN_UNK;
}
+ fan->fan_mode = NVBIOS_THERM_FAN_LINEAR;
fan->min_duty = nvbios_rd08(bios, data + 0x02);
fan->max_duty = nvbios_rd08(bios, data + 0x03);
dma_addr_t paddr;
int ret;
- /* only doing ARGB32 since this is what is needed to alpha-blend
- * with video overlays:
- */
sizes->surface_bpp = 32;
- sizes->surface_depth = 32;
+ sizes->surface_depth = 24;
DBG("create fbdev: %dx%d@%d (%dx%d)", sizes->surface_width,
sizes->surface_height, sizes->surface_bpp,
config SENSORS_SHT15
tristate "Sensiron humidity and temperature sensors. SHT15 and compat."
depends on GPIOLIB || COMPILE_TEST
+ select BITREVERSE
help
If you say yes here you get support for the Sensiron SHT10, SHT11,
SHT15, SHT71, SHT75 humidity and temperature sensors.
u16 config_orig;
unsigned long last_update;
int temp[3];
+ bool first_time;
};
/* convert left adjusted 13-bit TMP102 register value to milliCelsius */
tmp102->temp[i] = tmp102_reg_to_mC(status);
}
tmp102->last_update = jiffies;
+ tmp102->first_time = false;
}
mutex_unlock(&tmp102->lock);
return tmp102;
{
struct tmp102 *tmp102 = tmp102_update_device(dev);
+ /* Is it too early even to return a conversion? */
+ if (tmp102->first_time) {
+ dev_dbg(dev, "%s: Conversion not ready yet..\n", __func__);
+ return -EAGAIN;
+ }
+
*temp = tmp102->temp[0];
return 0;
struct sensor_device_attribute *sda = to_sensor_dev_attr(attr);
struct tmp102 *tmp102 = tmp102_update_device(dev);
+ /* Is it too early even to return a read? */
+ if (tmp102->first_time)
+ return -EAGAIN;
+
return sprintf(buf, "%d\n", tmp102->temp[sda->index]);
}
status = -ENODEV;
goto fail_restore_config;
}
- tmp102->last_update = jiffies - HZ;
+ tmp102->last_update = jiffies;
+ /* Mark that we are not ready with data until conversion is complete */
+ tmp102->first_time = true;
mutex_init(&tmp102->lock);
hwmon_dev = hwmon_device_register_with_groups(dev, client->name,
* d is always 6 on Keystone I2C controller
*/
- /* get minimum of 7 MHz clock, but max of 12 MHz */
- psc = (input_clock / 7000000) - 1;
+ /*
+ * Both the Davinci and current Keystone User Guides recommend a value
+ * between 7MHz and 12MHz. In reality a 7MHz module clock doesn't
+ * always produce enough margin between SDA and SCL transitions.
+ * Measurements show that the higher the module clock is, the
+ * bigger the margin is, providing more reliable communication.
+ * So we had better target 12MHz.
+ */
+ psc = (input_clock / 12000000) - 1;
if ((input_clock / (psc + 1)) > 12000000)
psc++; /* better to run under spec than over */
d = (psc >= 2) ? 5 : 7 - psc;
tx_aborted:
if ((stat & (DW_IC_INTR_TX_ABRT | DW_IC_INTR_STOP_DET)) || dev->msg_err)
complete(&dev->cmd_complete);
+ else if (unlikely(dev->accessor_flags & ACCESS_INTR_MASK)) {
+ /* workaround to trigger pending interrupt */
+ stat = dw_readl(dev, DW_IC_INTR_MASK);
+ i2c_dw_disable_int(dev);
+ dw_writel(dev, stat, DW_IC_INTR_MASK);
+ }
return IRQ_HANDLED;
}
#define ACCESS_SWAP 0x00000001
#define ACCESS_16BIT 0x00000002
+#define ACCESS_INTR_MASK 0x00000004
extern int i2c_dw_init(struct dw_i2c_dev *dev);
extern void i2c_dw_disable(struct dw_i2c_dev *dev);
static int dw_i2c_acpi_configure(struct platform_device *pdev)
{
struct dw_i2c_dev *dev = platform_get_drvdata(pdev);
+ const struct acpi_device_id *id;
dev->adapter.nr = -1;
dev->tx_fifo_depth = 32;
dw_i2c_acpi_params(pdev, "FMCN", &dev->fs_hcnt, &dev->fs_lcnt,
&dev->sda_hold_time);
+ id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev);
+ if (id && id->driver_data)
+ dev->accessor_flags |= (u32)id->driver_data;
+
return 0;
}
{ "INT3433", 0 },
{ "80860F41", 0 },
{ "808622C1", 0 },
- { "AMD0010", 0 },
+ { "AMD0010", ACCESS_INTR_MASK },
{ }
};
MODULE_DEVICE_TABLE(acpi, dw_i2c_acpi_match);
}
r = i2c_dw_probe(dev);
- if (r) {
+ if (r && !dev->pm_runtime_disabled)
pm_runtime_disable(&pdev->dev);
- return r;
- }
- return 0;
+ return r;
}
static int dw_i2c_plat_remove(struct platform_device *pdev)
pm_runtime_dont_use_autosuspend(&pdev->dev);
pm_runtime_put_sync(&pdev->dev);
- pm_runtime_disable(&pdev->dev);
+ if (!dev->pm_runtime_disabled)
+ pm_runtime_disable(&pdev->dev);
return 0;
}
i2c_imx, IMX_I2C_I2CR);
imx_i2c_write_reg(i2c_imx->hwdata->i2sr_clr_opcode, i2c_imx, IMX_I2C_I2SR);
+ i2c_imx_init_recovery_info(i2c_imx, pdev);
+
/* Add I2C adapter */
ret = i2c_add_numbered_adapter(&i2c_imx->adapter);
if (ret < 0) {
goto clk_disable;
}
- i2c_imx_init_recovery_info(i2c_imx, pdev);
-
/* Set up platform driver data */
platform_set_drvdata(pdev, i2c_imx);
clk_disable_unprepare(i2c_imx->clk);
bool errata_delay;
struct reset_control *rstc;
bool irq_clear_inverted;
+ /* Clk div is 2 to the power n, not 2 to the power n + 1 */
+ bool clk_n_base_0;
};
static struct mv64xxx_i2c_regs mv64xxx_i2c_regs_mv64xxx = {
#ifdef CONFIG_OF
#ifdef CONFIG_HAVE_CLK
static int
-mv64xxx_calc_freq(const int tclk, const int n, const int m)
+mv64xxx_calc_freq(struct mv64xxx_i2c_data *drv_data,
+ const int tclk, const int n, const int m)
{
- return tclk / (10 * (m + 1) * (2 << n));
+ if (drv_data->clk_n_base_0)
+ return tclk / (10 * (m + 1) * (1 << n));
+ else
+ return tclk / (10 * (m + 1) * (2 << n));
}
static bool
-mv64xxx_find_baud_factors(const u32 req_freq, const u32 tclk, u32 *best_n,
- u32 *best_m)
+mv64xxx_find_baud_factors(struct mv64xxx_i2c_data *drv_data,
+ const u32 req_freq, const u32 tclk)
{
int freq, delta, best_delta = INT_MAX;
int m, n;
for (n = 0; n <= 7; n++)
for (m = 0; m <= 15; m++) {
- freq = mv64xxx_calc_freq(tclk, n, m);
+ freq = mv64xxx_calc_freq(drv_data, tclk, n, m);
delta = req_freq - freq;
if (delta >= 0 && delta < best_delta) {
- *best_m = m;
- *best_n = n;
+ drv_data->freq_m = m;
+ drv_data->freq_n = n;
best_delta = delta;
}
if (best_delta == 0)
if (of_property_read_u32(np, "clock-frequency", &bus_freq))
bus_freq = 100000; /* 100kHz by default */
- if (!mv64xxx_find_baud_factors(bus_freq, tclk,
- &drv_data->freq_n, &drv_data->freq_m)) {
+ if (of_device_is_compatible(np, "allwinner,sun4i-a10-i2c") ||
+ of_device_is_compatible(np, "allwinner,sun6i-a31-i2c"))
+ drv_data->clk_n_base_0 = true;
+
+ if (!mv64xxx_find_baud_factors(drv_data, bus_freq, tclk)) {
rc = -EINVAL;
goto out;
}
if (slave->flags & I2C_CLIENT_TEN)
return -EAFNOSUPPORT;
- pm_runtime_forbid(rcar_i2c_priv_to_dev(priv));
+ pm_runtime_get_sync(rcar_i2c_priv_to_dev(priv));
priv->slave = slave;
rcar_i2c_write(priv, ICSAR, slave->addr);
priv->slave = NULL;
- pm_runtime_allow(rcar_i2c_priv_to_dev(priv));
+ pm_runtime_put(rcar_i2c_priv_to_dev(priv));
return 0;
}
&i2c->scl_fall_ns))
i2c->scl_fall_ns = 300;
if (of_property_read_u32(pdev->dev.of_node, "i2c-sda-falling-time-ns",
- &i2c->scl_fall_ns))
+ &i2c->sda_fall_ns))
i2c->sda_fall_ns = i2c->scl_fall_ns;
strlcpy(i2c->adap.name, "rk3x-i2c", sizeof(i2c->adap.name));
adap = &i2c_dev->adap;
i2c_set_adapdata(adap, i2c_dev);
- snprintf(adap->name, sizeof(adap->name), "ST I2C(0x%pa)", &res->start);
+ snprintf(adap->name, sizeof(adap->name), "ST I2C(%pa)", &res->start);
adap->owner = THIS_MODULE;
adap->timeout = 2 * HZ;
adap->retries = 0;
return cma_protocol_roce_dev_port(device, port_num);
}
-static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
- const struct net_device *net_dev)
+static bool cma_match_net_dev(const struct rdma_cm_id *id,
+ const struct net_device *net_dev,
+ u8 port_num)
{
- const struct rdma_addr *addr = &id_priv->id.route.addr;
+ const struct rdma_addr *addr = &id->route.addr;
if (!net_dev)
/* This request is an AF_IB request or a RoCE request */
- return addr->src_addr.ss_family == AF_IB ||
- cma_protocol_roce(&id_priv->id);
+ return (!id->port_num || id->port_num == port_num) &&
+ (addr->src_addr.ss_family == AF_IB ||
+ cma_protocol_roce_dev_port(id->device, port_num));
return !addr->dev_addr.bound_dev_if ||
(net_eq(dev_net(net_dev), addr->dev_addr.net) &&
hlist_for_each_entry(id_priv, &bind_list->owners, node) {
if (cma_match_private_data(id_priv, ib_event->private_data)) {
if (id_priv->id.device == cm_id->device &&
- cma_match_net_dev(id_priv, net_dev))
+ cma_match_net_dev(&id_priv->id, net_dev, req->port))
return id_priv;
list_for_each_entry(id_priv_dev,
&id_priv->listen_list,
listen_list) {
if (id_priv_dev->id.device == cm_id->device &&
- cma_match_net_dev(id_priv_dev, net_dev))
+ cma_match_net_dev(&id_priv_dev->id, net_dev, req->port))
return id_priv_dev;
}
}
mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
ib_umem_release(msrq->umem);
} else {
- kfree(msrq->wrid);
+ kvfree(msrq->wrid);
mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
&msrq->buf);
mlx4_db_free(dev->dev, &msrq->db);
u16 interface_type;
};
+enum ocrdma_flags {
+ OCRDMA_FLAGS_LINK_STATUS_INIT = 0x01
+};
+
struct ocrdma_dev {
struct ib_device ibdev;
struct ocrdma_dev_attr attr;
atomic_t update_sl;
u16 pvid;
u32 asic_id;
+ u32 flags;
ulong last_stats_time;
struct mutex stats_lock; /* provide synch for debugfs operations */
(state & OCRDMA_STATE_FLAG_SYNC);
}
+static inline u8 ocrdma_get_ae_link_state(u32 ae_state)
+{
+ return ((ae_state & OCRDMA_AE_LSC_LS_MASK) >> OCRDMA_AE_LSC_LS_SHIFT);
+}
+
#endif
cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE);
cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE);
+ /* Request link events on this MQ. */
+ cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_LINK_EVE_CODE);
cmd->async_cqid_ringsize = cq->id;
cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) <<
}
}
+static void ocrdma_process_link_state(struct ocrdma_dev *dev,
+ struct ocrdma_ae_mcqe *cqe)
+{
+ struct ocrdma_ae_lnkst_mcqe *evt;
+ u8 lstate;
+
+ evt = (struct ocrdma_ae_lnkst_mcqe *)cqe;
+ lstate = ocrdma_get_ae_link_state(evt->speed_state_ptn);
+
+ if (!(lstate & OCRDMA_AE_LSC_LLINK_MASK))
+ return;
+
+ if (dev->flags & OCRDMA_FLAGS_LINK_STATUS_INIT)
+ ocrdma_update_link_state(dev, (lstate & OCRDMA_LINK_ST_MASK));
+}
+
static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe)
{
/* async CQE processing */
struct ocrdma_ae_mcqe *cqe = ae_cqe;
u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >>
OCRDMA_AE_MCQE_EVENT_CODE_SHIFT;
-
- if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE)
+ switch (evt_code) {
+ case OCRDMA_ASYNC_LINK_EVE_CODE:
+ ocrdma_process_link_state(dev, cqe);
+ break;
+ case OCRDMA_ASYNC_RDMA_EVE_CODE:
ocrdma_dispatch_ibevent(dev, cqe);
- else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE)
+ break;
+ case OCRDMA_ASYNC_GRP5_EVE_CODE:
ocrdma_process_grp5_aync(dev, cqe);
- else
+ break;
+ default:
pr_err("%s(%d) invalid evt code=0x%x\n", __func__,
dev->id, evt_code);
+ }
}
static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe)
return status;
}
-int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed)
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed,
+ u8 *lnk_state)
{
int status = -ENOMEM;
struct ocrdma_get_link_speed_rsp *rsp;
goto mbx_err;
rsp = (struct ocrdma_get_link_speed_rsp *)cmd;
- *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK)
- >> OCRDMA_PHY_PS_SHIFT;
+ if (lnk_speed)
+ *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK)
+ >> OCRDMA_PHY_PS_SHIFT;
+ if (lnk_state)
+ *lnk_state = (rsp->res_lnk_st & OCRDMA_LINK_ST_MASK);
mbx_err:
kfree(cmd);
ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid));
cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8);
- if (vlan_id < 0x1000) {
- if (dev->pfc_state) {
- vlan_id = 0;
+ if (vlan_id == 0xFFFF)
+ vlan_id = 0;
+ if (vlan_id || dev->pfc_state) {
+ if (!vlan_id) {
pr_err("ocrdma%d:Using VLAN with PFC is recommended\n",
dev->id);
pr_err("ocrdma%d:Using VLAN 0 for this connection\n",
bool solicited, u16 cqe_popped);
/* verbs specific mailbox commands */
-int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed);
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed,
+ u8 *lnk_st);
int ocrdma_query_config(struct ocrdma_dev *,
struct ocrdma_mbx_query_config *config);
void ocrdma_init_service_level(struct ocrdma_dev *);
void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev);
void ocrdma_free_pd_range(struct ocrdma_dev *dev);
+void ocrdma_update_link_state(struct ocrdma_dev *dev, u8 lstate);
#endif /* __OCRDMA_HW_H__ */
static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
{
int status = 0, i;
+ u8 lstate = 0;
struct ocrdma_dev *dev;
dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev));
if (status)
goto alloc_err;
+ /* Query Link state and update */
+ status = ocrdma_mbx_get_link_speed(dev, NULL, &lstate);
+ if (!status)
+ ocrdma_update_link_state(dev, lstate);
+
for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i]))
goto sysfs_err;
ocrdma_remove_free(dev);
}
-static int ocrdma_open(struct ocrdma_dev *dev)
+static int ocrdma_dispatch_port_active(struct ocrdma_dev *dev)
{
struct ib_event port_event;
return 0;
}
-static int ocrdma_close(struct ocrdma_dev *dev)
+static int ocrdma_dispatch_port_error(struct ocrdma_dev *dev)
{
- int i;
- struct ocrdma_qp *qp, **cur_qp;
struct ib_event err_event;
- struct ib_qp_attr attrs;
- int attr_mask = IB_QP_STATE;
-
- attrs.qp_state = IB_QPS_ERR;
- mutex_lock(&dev->dev_lock);
- if (dev->qp_tbl) {
- cur_qp = dev->qp_tbl;
- for (i = 0; i < OCRDMA_MAX_QP; i++) {
- qp = cur_qp[i];
- if (qp && qp->ibqp.qp_type != IB_QPT_GSI) {
- /* change the QP state to ERROR */
- _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask);
-
- err_event.event = IB_EVENT_QP_FATAL;
- err_event.element.qp = &qp->ibqp;
- err_event.device = &dev->ibdev;
- ib_dispatch_event(&err_event);
- }
- }
- }
- mutex_unlock(&dev->dev_lock);
err_event.event = IB_EVENT_PORT_ERR;
err_event.element.port_num = 1;
static void ocrdma_shutdown(struct ocrdma_dev *dev)
{
- ocrdma_close(dev);
+ ocrdma_dispatch_port_error(dev);
ocrdma_remove(dev);
}
static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event)
{
switch (event) {
- case BE_DEV_UP:
- ocrdma_open(dev);
- break;
- case BE_DEV_DOWN:
- ocrdma_close(dev);
- break;
case BE_DEV_SHUTDOWN:
ocrdma_shutdown(dev);
break;
+ default:
+ break;
}
}
+void ocrdma_update_link_state(struct ocrdma_dev *dev, u8 lstate)
+{
+ if (!(dev->flags & OCRDMA_FLAGS_LINK_STATUS_INIT)) {
+ dev->flags |= OCRDMA_FLAGS_LINK_STATUS_INIT;
+ if (!lstate)
+ return;
+ }
+
+ if (!lstate)
+ ocrdma_dispatch_port_error(dev);
+ else
+ ocrdma_dispatch_port_active(dev);
+}
+
static struct ocrdma_driver ocrdma_drv = {
.name = "ocrdma_driver",
.add = ocrdma_add,
u32 valid_ae_event;
};
-#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14
-#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5
+enum ocrdma_async_event_code {
+ OCRDMA_ASYNC_LINK_EVE_CODE = 0x01,
+ OCRDMA_ASYNC_GRP5_EVE_CODE = 0x05,
+ OCRDMA_ASYNC_RDMA_EVE_CODE = 0x14
+};
enum ocrdma_async_grp5_events {
OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01,
OCRDMA_MAX_ASYNC_ERRORS
};
+struct ocrdma_ae_lnkst_mcqe {
+ u32 speed_state_ptn;
+ u32 qos_reason_falut;
+ u32 evt_tag;
+ u32 valid_ae_event;
+};
+
+enum {
+ OCRDMA_AE_LSC_PORT_NUM_MASK = 0x3F,
+ OCRDMA_AE_LSC_PT_SHIFT = 0x06,
+ OCRDMA_AE_LSC_PT_MASK = (0x03 <<
+ OCRDMA_AE_LSC_PT_SHIFT),
+ OCRDMA_AE_LSC_LS_SHIFT = 0x08,
+ OCRDMA_AE_LSC_LS_MASK = (0xFF <<
+ OCRDMA_AE_LSC_LS_SHIFT),
+ OCRDMA_AE_LSC_LD_SHIFT = 0x10,
+ OCRDMA_AE_LSC_LD_MASK = (0xFF <<
+ OCRDMA_AE_LSC_LD_SHIFT),
+ OCRDMA_AE_LSC_PPS_SHIFT = 0x18,
+ OCRDMA_AE_LSC_PPS_MASK = (0xFF <<
+ OCRDMA_AE_LSC_PPS_SHIFT),
+ OCRDMA_AE_LSC_PPF_MASK = 0xFF,
+ OCRDMA_AE_LSC_ER_SHIFT = 0x08,
+ OCRDMA_AE_LSC_ER_MASK = (0xFF <<
+ OCRDMA_AE_LSC_ER_SHIFT),
+ OCRDMA_AE_LSC_QOS_SHIFT = 0x10,
+ OCRDMA_AE_LSC_QOS_MASK = (0xFFFF <<
+ OCRDMA_AE_LSC_QOS_SHIFT)
+};
+
+enum {
+ OCRDMA_AE_LSC_PLINK_DOWN = 0x00,
+ OCRDMA_AE_LSC_PLINK_UP = 0x01,
+ OCRDMA_AE_LSC_LLINK_DOWN = 0x02,
+ OCRDMA_AE_LSC_LLINK_MASK = 0x02,
+ OCRDMA_AE_LSC_LLINK_UP = 0x03
+};
+
/* mailbox command request and responses */
enum {
OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT = 2,
OCRDMA_PHY_PFLT_SHIFT = 0x18,
OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000,
OCRDMA_QOS_LNKSP_SHIFT = 0x10,
- OCRDMA_LLST_MASK = 0xFF,
+ OCRDMA_LINK_ST_MASK = 0x01,
OCRDMA_PLFC_MASK = 0x00000400,
OCRDMA_PLFC_SHIFT = 0x8,
OCRDMA_PLRFC_MASK = 0x00000200,
u32 pflt_pps_ld_pnum;
u32 qos_lsp;
- u32 res_lls;
+ u32 res_lnk_st;
};
enum {
int status;
u8 speed;
- status = ocrdma_mbx_get_link_speed(dev, &speed);
+ status = ocrdma_mbx_get_link_speed(dev, &speed, NULL);
if (status)
speed = OCRDMA_PHYS_LINK_SPEED_ZERO;
return;
}
+ memset(&db9_parport_cb, 0, sizeof(db9_parport_cb));
db9_parport_cb.flags = PARPORT_FLAG_EXCL;
pd = parport_register_dev_model(pp, "db9", &db9_parport_cb, port_idx);
pads = gc_cfg[port_idx].args + 1;
n_pads = gc_cfg[port_idx].nargs - 1;
+ memset(&gc_parport_cb, 0, sizeof(gc_parport_cb));
gc_parport_cb.flags = PARPORT_FLAG_EXCL;
pd = parport_register_dev_model(pp, "gamecon", &gc_parport_cb,
n_buttons = tgfx_cfg[port_idx].args + 1;
n_devs = tgfx_cfg[port_idx].nargs - 1;
+ memset(&tgfx_parport_cb, 0, sizeof(tgfx_parport_cb));
tgfx_parport_cb.flags = PARPORT_FLAG_EXCL;
pd = parport_register_dev_model(pp, "turbografx", &tgfx_parport_cb,
w->parport = pp;
+ memset(&walkera0701_parport_cb, 0, sizeof(walkera0701_parport_cb));
walkera0701_parport_cb.flags = PARPORT_FLAG_EXCL;
walkera0701_parport_cb.irq_func = walkera0701_irq_handler;
walkera0701_parport_cb.private = w;
ret = regmap_update_bits(arizona->regmap,
ARIZONA_HAPTICS_CONTROL_1,
- ARIZONA_HAP_CTRL_MASK,
- 1 << ARIZONA_HAP_CTRL_SHIFT);
+ ARIZONA_HAP_CTRL_MASK, 0);
if (ret != 0) {
dev_err(arizona->dev, "Failed to stop haptics: %d\n",
ret);
#define DRIVER_NAME "elan_i2c"
#define ELAN_DRIVER_VERSION "1.6.1"
+#define ELAN_VENDOR_ID 0x04f3
#define ETP_MAX_PRESSURE 255
#define ETP_FWIDTH_REDUCE 90
#define ETP_FINGER_WIDTH 15
input->name = "Elan Touchpad";
input->id.bustype = BUS_I2C;
+ input->id.vendor = ELAN_VENDOR_ID;
+ input->id.product = data->product_id;
input_set_drvdata(input, data);
error = input_mt_init_slots(input, ETP_MAX_FINGERS,
{
struct pardev_cb parkbd_parport_cb;
+ memset(&parkbd_parport_cb, 0, sizeof(parkbd_parport_cb));
parkbd_parport_cb.irq_func = parkbd_interrupt;
parkbd_parport_cb.flags = PARPORT_FLAG_EXCL;
input_set_abs_params(inputdev, ABS_TILT_Y, AIPTEK_TILT_MIN, AIPTEK_TILT_MAX, 0, 0);
input_set_abs_params(inputdev, ABS_WHEEL, AIPTEK_WHEEL_MIN, AIPTEK_WHEEL_MAX - 1, 0, 0);
+ /* Verify that a device really has an endpoint */
+ if (intf->altsetting[0].desc.bNumEndpoints < 1) {
+ dev_err(&intf->dev,
+ "interface has %d endpoints, but must have minimum 1\n",
+ intf->altsetting[0].desc.bNumEndpoints);
+ err = -EINVAL;
+ goto fail3;
+ }
endpoint = &intf->altsetting[0].endpoint[0].desc;
/* Go set up our URB, which is called when the tablet receives
if (i == ARRAY_SIZE(speeds)) {
dev_info(&intf->dev,
"Aiptek tried all speeds, no sane response\n");
+ err = -EINVAL;
goto fail3;
}
{ }
};
+static unsigned int chromebook_tp_buttons[] = {
+ KEY_RESERVED,
+ KEY_RESERVED,
+ KEY_RESERVED,
+ KEY_RESERVED,
+ KEY_RESERVED,
+ BTN_LEFT
+};
+
+static struct mxt_acpi_platform_data chromebook_platform_data[] = {
+ {
+ /* Touchpad */
+ .hid = "ATML0000",
+ .pdata = {
+ .t19_num_keys = ARRAY_SIZE(chromebook_tp_buttons),
+ .t19_keymap = chromebook_tp_buttons,
+ },
+ },
+ {
+ /* Touchscreen */
+ .hid = "ATML0001",
+ },
+ { }
+};
+
static const struct dmi_system_id mxt_dmi_table[] = {
{
/* 2015 Google Pixel */
},
.driver_data = samus_platform_data,
},
+ {
+ /* Other Google Chromebooks */
+ .ident = "Chromebook",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+ },
+ .driver_data = chromebook_platform_data,
+ },
{ }
};
{ "qt602240_ts", 0 },
{ "atmel_mxt_ts", 0 },
{ "atmel_mxt_tp", 0 },
+ { "maxtouch", 0 },
{ "mXT224", 0 },
{ }
};
disable_irq(client->irq);
- if (device_may_wakeup(dev) || ts->keep_power_in_suspend) {
+ if (device_may_wakeup(dev)) {
+ /*
+ * The device will automatically enter idle mode
+ * that has reduced power consumption.
+ */
+ ts->wake_irq_enabled = (enable_irq_wake(client->irq) == 0);
+ } else if (ts->keep_power_in_suspend) {
for (retry_cnt = 0; retry_cnt < MAX_RETRIES; retry_cnt++) {
error = elants_i2c_send(client, set_sleep_cmd,
sizeof(set_sleep_cmd));
dev_err(&client->dev,
"suspend command failed: %d\n", error);
}
-
- if (device_may_wakeup(dev))
- ts->wake_irq_enabled =
- (enable_irq_wake(client->irq) == 0);
} else {
elants_i2c_power_off(ts);
}
int retry_cnt;
int error;
- if (device_may_wakeup(dev) && ts->wake_irq_enabled)
- disable_irq_wake(client->irq);
-
- if (ts->keep_power_in_suspend) {
+ if (device_may_wakeup(dev)) {
+ if (ts->wake_irq_enabled)
+ disable_irq_wake(client->irq);
+ elants_i2c_sw_reset(client);
+ } else if (ts->keep_power_in_suspend) {
for (retry_cnt = 0; retry_cnt < MAX_RETRIES; retry_cnt++) {
error = elants_i2c_send(client, set_active_cmd,
sizeof(set_active_cmd));
}
}
+static bool access_error(struct vm_area_struct *vma, struct fault *fault)
+{
+ unsigned long requested = 0;
+
+ if (fault->flags & PPR_FAULT_EXEC)
+ requested |= VM_EXEC;
+
+ if (fault->flags & PPR_FAULT_READ)
+ requested |= VM_READ;
+
+ if (fault->flags & PPR_FAULT_WRITE)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+}
+
static void do_fault(struct work_struct *work)
{
struct fault *fault = container_of(work, struct fault, work);
goto out;
}
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) {
- /* handle_mm_fault would BUG_ON() */
+ /* Check if we have the right permissions on the vma */
+ if (access_error(vma, fault)) {
up_read(&mm->mmap_sem);
handle_fault_error(fault);
goto out;
#include <linux/device.h>
#include <linux/dma-iommu.h>
+#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/iommu.h>
#include <linux/iova.h>
#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/vmalloc.h>
int iommu_dma_init(void)
{
{
struct page **pages;
unsigned int i = 0, array_size = count * sizeof(*pages);
+ unsigned int order = MAX_ORDER;
if (array_size <= PAGE_SIZE)
pages = kzalloc(array_size, GFP_KERNEL);
while (count) {
struct page *page = NULL;
- int j, order = __fls(count);
+ int j;
/*
* Higher-order allocations are a convenience rather
* than a necessity, hence using __GFP_NORETRY until
* falling back to single-page allocations.
*/
- for (order = min(order, MAX_ORDER); order > 0; order--) {
+ for (order = min_t(unsigned int, order, __fls(count));
+ order > 0; order--) {
page = alloc_pages(gfp | __GFP_NORETRY, order);
if (!page)
continue;
size_t s_offset = iova_offset(iovad, s->offset);
size_t s_length = s->length;
- sg_dma_address(s) = s->offset;
+ sg_dma_address(s) = s_offset;
sg_dma_len(s) = s_length;
s->offset -= s_offset;
s_length = iova_align(iovad, s_length + s_offset);
sg_res = aligned_nrpages(sg->offset, sg->length);
sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
sg->dma_length = sg->length;
- pteval = (sg_phys(sg) & PAGE_MASK) | prot;
+ pteval = page_to_phys(sg_page(sg)) | prot;
phys_pfn = pteval >> VTD_PAGE_SHIFT;
}
for_each_sg(sglist, sg, nelems, i) {
BUG_ON(!sg_page(sg));
- sg->dma_address = sg_phys(sg);
+ sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
sg->dma_length = sg->length;
}
return nelems;
};
#define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
+
+static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req)
+{
+ unsigned long requested = 0;
+
+ if (req->exe_req)
+ requested |= VM_EXEC;
+
+ if (req->rd_req)
+ requested |= VM_READ;
+
+ if (req->wr_req)
+ requested |= VM_WRITE;
+
+ return (requested & ~vma->vm_flags) != 0;
+}
+
static irqreturn_t prq_event_thread(int irq, void *d)
{
struct intel_iommu *iommu = d;
if (!vma || address < vma->vm_start)
goto invalid;
+ if (access_error(vma, req))
+ goto invalid;
+
ret = handle_mm_fault(svm->mm, vma, address,
req->wr_req ? FAULT_FLAG_WRITE : 0);
if (ret & VM_FAULT_ERROR)
min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap);
for_each_sg(sg, s, nents, i) {
- phys_addr_t phys = sg_phys(s);
+ phys_addr_t phys = page_to_phys(sg_page(s)) + s->offset;
/*
* We are mapping on IOMMU page boundaries, so offset within
static int ipmmu_domain_init_context(struct ipmmu_vmsa_domain *domain)
{
- phys_addr_t ttbr;
+ u64 ttbr;
/*
* Allocate the page table operations.
struct sk_buff *skb = bcs->tx_skb;
int sent = -EOPNOTSUPP;
- if (!tty || !tty->driver || !skb)
- return -EINVAL;
+ WARN_ON(!tty || !tty->ops || !skb);
if (!skb->len) {
dev_kfree_skb_any(skb);
unsigned long flags;
int sent = 0;
- if (!tty || !tty->driver)
- return -EFAULT;
+ WARN_ON(!tty || !tty->ops);
cb = cs->cmdbuf;
if (!cb)
tasklet_kill(&cs->write_tasklet);
if (!cs->hw.ser)
return;
- dev_set_drvdata(&cs->hw.ser->dev.dev, NULL);
platform_device_unregister(&cs->hw.ser->dev);
- kfree(cs->hw.ser);
- cs->hw.ser = NULL;
}
static void gigaset_device_release(struct device *dev)
{
- struct platform_device *pdev = to_platform_device(dev);
+ struct cardstate *cs = dev_get_drvdata(dev);
- /* adapted from platform_device_release() in drivers/base/platform.c */
- kfree(dev->platform_data);
- kfree(pdev->resource);
+ if (!cs)
+ return;
+ dev_set_drvdata(dev, NULL);
+ kfree(cs->hw.ser);
+ cs->hw.ser = NULL;
}
/*
struct tty_struct *tty = cs->hw.ser->tty;
unsigned int set, clear;
- if (!tty || !tty->driver || !tty->ops->tiocmset)
+ WARN_ON(!tty || !tty->ops);
+ /* tiocmset is an optional tty driver method */
+ if (!tty->ops->tiocmset)
return -EINVAL;
set = new_state & ~old_state;
clear = old_state & ~new_state;
if (ipac->type & IPAC_TYPE_IPACX) {
ista = ReadIPAC(ipac, ISACX_ISTA);
- while (ista && cnt--) {
+ while (ista && --cnt) {
pr_debug("%s: ISTA %02x\n", ipac->name, ista);
if (ista & IPACX__ICA)
ipac_irq(&ipac->hscx[0], ista);
}
} else if (ipac->type & IPAC_TYPE_IPAC) {
ista = ReadIPAC(ipac, IPAC_ISTA);
- while (ista && cnt--) {
+ while (ista && --cnt) {
pr_debug("%s: ISTA %02x\n", ipac->name, ista);
if (ista & (IPAC__ICD | IPAC__EXD)) {
istad = ReadISAC(isac, ISAC_ISTA);
ista = ReadIPAC(ipac, IPAC_ISTA);
}
} else if (ipac->type & IPAC_TYPE_HSCX) {
- while (cnt) {
+ while (--cnt) {
ista = ReadIPAC(ipac, IPAC_ISTAB + ipac->hscx[1].off);
pr_debug("%s: B2 ISTA %02x\n", ipac->name, ista);
if (ista)
mISDNisac_irq(isac, istad);
if (0 == (ista | istad))
break;
- cnt--;
}
}
if (cnt > maxloop) /* only for ISAC/HSCX without PCI IRQ test */
struct nvm_block *blk;
int i;
- lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun];
+ lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun];
for (i = 0; i < nr_blocks; i++) {
if (blks[i] == 0)
*/
void mddev_suspend(struct mddev *mddev)
{
- BUG_ON(mddev->suspended);
- mddev->suspended = 1;
+ if (mddev->suspended++)
+ return;
synchronize_rcu();
wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
mddev->pers->quiesce(mddev, 1);
void mddev_resume(struct mddev *mddev)
{
- mddev->suspended = 0;
+ if (--mddev->suspended)
+ return;
wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
if (mddev->recovery_cp == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
- rdev->raid_disk = mddev->raid_disks;
+ rdev->raid_disk = 0;
break;
default:
rdev->saved_raid_disk = role;
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
+ int err;
if (rdev->raid_disk != -1)
return -EBUSY;
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
- remove_and_add_spares(rdev->mddev, rdev);
- if (rdev->raid_disk == -1)
- return -EBUSY;
+ err = rdev->mddev->pers->
+ hot_add_disk(rdev->mddev, rdev);
+ if (err) {
+ rdev->raid_disk = -1;
+ return err;
+ } else
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+ if (sysfs_link_rdev(rdev->mddev, rdev))
+ /* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
}
mddev_unlock(mddev);
}
- } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- err = mddev->pers->start_reshape(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ err = -EBUSY;
+ else {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ err = mddev->pers->start_reshape(mddev);
+ }
mddev_unlock(mddev);
}
if (err)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ if (!test_bit(Replacement, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
+ if (!test_bit(Replacement, &rdev->flags) &&
+ !test_bit(Journal, &rdev->flags) &&
+ mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
first = i;
fbio = r10_bio->devs[i].bio;
+ fbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ fbio->bi_iter.bi_idx = 0;
vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
/* now find blocks with errors */
bio_reset(tbio);
tbio->bi_vcnt = vcnt;
- tbio->bi_iter.bi_size = r10_bio->sectors << 9;
+ tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
{
int i;
- for (i = 0; i < IVTV_CARD_MAX_VIDEO_INPUTS - 1; i++)
+ for (i = 0; i < IVTV_CARD_MAX_VIDEO_INPUTS; i++)
if (itv->card->video_inputs[i].video_type == 0)
break;
itv->nof_inputs = i;
- for (i = 0; i < IVTV_CARD_MAX_AUDIO_INPUTS - 1; i++)
+ for (i = 0; i < IVTV_CARD_MAX_AUDIO_INPUTS; i++)
if (itv->card->audio_inputs[i].audio_type == 0)
break;
itv->nof_audio_inputs = i;
int urbs_submitted;
/* USB control message buffer */
- #define BUF_SIZE 24
+ #define BUF_SIZE 128
u8 buf[BUF_SIZE];
/* Current configuration */
#include <media/videobuf2-v4l2.h>
#include <media/videobuf2-vmalloc.h>
+/*
+ * Used Avago MGA-81563 RF amplifier could be destroyed pretty easily with too
+ * strong signal or transmitting to bad antenna.
+ * Set RF gain control to 'grabbed' state by default for sure.
+ */
+static bool hackrf_enable_rf_gain_ctrl;
+module_param_named(enable_rf_gain_ctrl, hackrf_enable_rf_gain_ctrl, bool, 0644);
+MODULE_PARM_DESC(enable_rf_gain_ctrl, "enable RX/TX RF amplifier control (warn: could damage amplifier)");
+
/* HackRF USB API commands (from HackRF Library) */
enum {
CMD_SET_TRANSCEIVER_MODE = 0x01,
dev_err(dev->dev, "Could not initialize controls\n");
goto err_v4l2_ctrl_handler_free_rx;
}
+ v4l2_ctrl_grab(dev->rx_rf_gain, !hackrf_enable_rf_gain_ctrl);
v4l2_ctrl_handler_setup(&dev->rx_ctrl_handler);
/* Register controls for transmitter */
dev_err(dev->dev, "Could not initialize controls\n");
goto err_v4l2_ctrl_handler_free_tx;
}
+ v4l2_ctrl_grab(dev->tx_rf_gain, !hackrf_enable_rf_gain_ctrl);
v4l2_ctrl_handler_setup(&dev->tx_ctrl_handler);
/* Register the v4l2_device structure */
err_kfree:
kfree(dev);
err:
- dev_dbg(dev->dev, "failed=%d\n", ret);
+ dev_dbg(&intf->dev, "failed=%d\n", ret);
return ret;
}
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/compiler.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/slab.h>
mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1;
mtd->writesize_mask = (1 << mtd->writesize_shift) - 1;
- if (mtd->dev.parent) {
- if (!mtd->owner && mtd->dev.parent->driver)
- mtd->owner = mtd->dev.parent->driver->owner;
- if (!mtd->name)
- mtd->name = dev_name(mtd->dev.parent);
- } else {
- pr_debug("mtd device won't show a device symlink in sysfs\n");
- }
-
/* Some chips always power up locked. Unlock them now */
if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK)) {
error = mtd_unlock(mtd, 0, mtd->size);
return 0;
}
+/*
+ * Set a few defaults based on the parent devices, if not provided by the
+ * driver
+ */
+static void mtd_set_dev_defaults(struct mtd_info *mtd)
+{
+ if (mtd->dev.parent) {
+ if (!mtd->owner && mtd->dev.parent->driver)
+ mtd->owner = mtd->dev.parent->driver->owner;
+ if (!mtd->name)
+ mtd->name = dev_name(mtd->dev.parent);
+ } else {
+ pr_debug("mtd device won't show a device symlink in sysfs\n");
+ }
+}
/**
* mtd_device_parse_register - parse partitions and register an MTD device.
int ret;
struct mtd_partition *real_parts = NULL;
+ mtd_set_dev_defaults(mtd);
+
ret = parse_mtd_partitions(mtd, types, &real_parts, parser_data);
if (ret <= 0 && nr_parts && parts) {
real_parts = kmemdup(parts, sizeof(*parts) * nr_parts,
ofpart_node = of_get_child_by_name(mtd_node, "partitions");
if (!ofpart_node) {
- pr_warn("%s: 'partitions' subnode not found on %s. Trying to parse direct subnodes as partitions.\n",
- master->name, mtd_node->full_name);
+ /*
+ * We might get here even when ofpart isn't used at all (e.g.,
+ * when using another parser), so don't be louder than
+ * KERN_DEBUG
+ */
+ pr_debug("%s: 'partitions' subnode not found on %s. Trying to parse direct subnodes as partitions.\n",
+ master->name, mtd_node->full_name);
ofpart_node = mtd_node;
dedicated = false;
+ } else if (!of_device_is_compatible(ofpart_node, "fixed-partitions")) {
+ /* The 'partitions' subnode might be used by another parser */
+ return 0;
}
/* First count the subnodes */
status_old = read_sr(nor);
/* Cannot unlock; would unlock larger region than requested */
- if (stm_is_locked_sr(nor, status_old, ofs - mtd->erasesize,
- mtd->erasesize))
+ if (stm_is_locked_sr(nor, ofs - mtd->erasesize, mtd->erasesize,
+ status_old))
return -EINVAL;
/*
if (JEDEC_MFR(info) == SNOR_MFR_ATMEL ||
JEDEC_MFR(info) == SNOR_MFR_INTEL ||
- JEDEC_MFR(info) == SNOR_MFR_SST ||
- JEDEC_MFR(info) == SNOR_MFR_WINBOND) {
+ JEDEC_MFR(info) == SNOR_MFR_SST) {
write_enable(nor);
write_sr(nor, 0);
}
mtd->_read = spi_nor_read;
/* NOR protection support for STmicro/Micron chips and similar */
- if (JEDEC_MFR(info) == SNOR_MFR_MICRON ||
- JEDEC_MFR(info) == SNOR_MFR_WINBOND) {
+ if (JEDEC_MFR(info) == SNOR_MFR_MICRON) {
nor->flash_lock = stm_lock;
nor->flash_unlock = stm_unlock;
nor->flash_is_locked = stm_is_locked;
dfs_rootdir = debugfs_create_dir("ubi", NULL);
if (IS_ERR_OR_NULL(dfs_rootdir)) {
- int err = dfs_rootdir ? -ENODEV : PTR_ERR(dfs_rootdir);
+ int err = dfs_rootdir ? PTR_ERR(dfs_rootdir) : -ENODEV;
pr_err("UBI error: cannot create \"ubi\" debugfs directory, error %d\n",
err);
if (err && err != UBI_IO_BITFLIPS && !mtd_is_eccerr(err))
goto exit;
- crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_EC_HDR_SIZE_CRC);
+ crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
hdr_crc = be32_to_cpu(vid_hdr->hdr_crc);
if (hdr_crc != crc) {
ubi_err(ubi, "bad VID header CRC at PEB %d, calculated %#08x, read %#08x",
return 0;
}
+static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk);
/**
* do_sync_erase - run the erase worker synchronously.
* @ubi: UBI device description object
static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
int vol_id, int lnum, int torture)
{
- struct ubi_work *wl_wrk;
+ struct ubi_work wl_wrk;
dbg_wl("sync erase of PEB %i", e->pnum);
- wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
- if (!wl_wrk)
- return -ENOMEM;
-
- wl_wrk->e = e;
- wl_wrk->vol_id = vol_id;
- wl_wrk->lnum = lnum;
- wl_wrk->torture = torture;
+ wl_wrk.e = e;
+ wl_wrk.vol_id = vol_id;
+ wl_wrk.lnum = lnum;
+ wl_wrk.torture = torture;
- return erase_worker(ubi, wl_wrk, 0);
+ return __erase_worker(ubi, &wl_wrk);
}
/**
}
/**
- * erase_worker - physical eraseblock erase worker function.
+ * __erase_worker - physical eraseblock erase worker function.
* @ubi: UBI device description object
* @wl_wrk: the work object
* @shutdown: non-zero if the worker has to free memory and exit
* needed. Returns zero in case of success and a negative error code in case of
* failure.
*/
-static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
- int shutdown)
+static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk)
{
struct ubi_wl_entry *e = wl_wrk->e;
int pnum = e->pnum;
int lnum = wl_wrk->lnum;
int err, available_consumed = 0;
- if (shutdown) {
- dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec);
- kfree(wl_wrk);
- wl_entry_destroy(ubi, e);
- return 0;
- }
-
dbg_wl("erase PEB %d EC %d LEB %d:%d",
pnum, e->ec, wl_wrk->vol_id, wl_wrk->lnum);
err = sync_erase(ubi, e, wl_wrk->torture);
if (!err) {
- /* Fine, we've erased it successfully */
- kfree(wl_wrk);
-
spin_lock(&ubi->wl_lock);
wl_tree_add(e, &ubi->free);
ubi->free_count++;
}
ubi_err(ubi, "failed to erase PEB %d, error %d", pnum, err);
- kfree(wl_wrk);
if (err == -EINTR || err == -ENOMEM || err == -EAGAIN ||
err == -EBUSY) {
/* Re-schedule the LEB for erasure */
err1 = schedule_erase(ubi, e, vol_id, lnum, 0);
if (err1) {
+ wl_entry_destroy(ubi, e);
err = err1;
goto out_ro;
}
return err;
}
+static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
+ int shutdown)
+{
+ int ret;
+
+ if (shutdown) {
+ struct ubi_wl_entry *e = wl_wrk->e;
+
+ dbg_wl("cancel erasure of PEB %d EC %d", e->pnum, e->ec);
+ kfree(wl_wrk);
+ wl_entry_destroy(ubi, e);
+ return 0;
+ }
+
+ ret = __erase_worker(ubi, wl_wrk);
+ kfree(wl_wrk);
+ return ret;
+}
+
/**
* ubi_wl_put_peb - return a PEB to the wear-leveling sub-system.
* @ubi: UBI device description object
usleep_range(10, 15);
/* Poll Until Poll Condition */
- while (count-- && XGMAC_IOREAD_BITS(pdata, DMA_MR, SWR))
+ while (--count && XGMAC_IOREAD_BITS(pdata, DMA_MR, SWR))
usleep_range(500, 600);
if (!count)
/* Poll Until Poll Condition */
for (i = 0; i < pdata->tx_q_count; i++) {
count = 2000;
- while (count-- && XGMAC_MTL_IOREAD_BITS(pdata, i,
+ while (--count && XGMAC_MTL_IOREAD_BITS(pdata, i,
MTL_Q_TQOMR, FTQ))
usleep_range(500, 600);
struct sk_buff *skb)
{
struct device *dev = ndev_to_dev(tx_ring->ndev);
+ struct xgene_enet_pdata *pdata = netdev_priv(tx_ring->ndev);
struct xgene_enet_raw_desc *raw_desc;
__le64 *exp_desc = NULL, *exp_bufs = NULL;
dma_addr_t dma_addr, pbuf_addr, *frag_dma_addr;
raw_desc->m0 = cpu_to_le64(SET_VAL(LL, ll) | SET_VAL(NV, nv) |
SET_VAL(USERINFO, tx_ring->tail));
tx_ring->cp_ring->cp_skb[tx_ring->tail] = skb;
+ pdata->tx_level += count;
tx_ring->tail = tail;
return count;
{
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct xgene_enet_desc_ring *tx_ring = pdata->tx_ring;
- struct xgene_enet_desc_ring *cp_ring = tx_ring->cp_ring;
- u32 tx_level, cq_level;
+ u32 tx_level = pdata->tx_level;
int count;
- tx_level = pdata->ring_ops->len(tx_ring);
- cq_level = pdata->ring_ops->len(cp_ring);
- if (unlikely(tx_level > pdata->tx_qcnt_hi ||
- cq_level > pdata->cp_qcnt_hi)) {
+ if (tx_level < pdata->txc_level)
+ tx_level += ((typeof(pdata->tx_level))~0U);
+
+ if ((tx_level - pdata->txc_level) > pdata->tx_qcnt_hi) {
netif_stop_queue(ndev);
return NETDEV_TX_BUSY;
}
struct xgene_enet_raw_desc *raw_desc, *exp_desc;
u16 head = ring->head;
u16 slots = ring->slots - 1;
- int ret, count = 0, processed = 0;
+ int ret, desc_count, count = 0, processed = 0;
+ bool is_completion;
do {
raw_desc = &ring->raw_desc[head];
+ desc_count = 0;
+ is_completion = false;
exp_desc = NULL;
if (unlikely(xgene_enet_is_desc_slot_empty(raw_desc)))
break;
}
dma_rmb();
count++;
+ desc_count++;
}
- if (is_rx_desc(raw_desc))
+ if (is_rx_desc(raw_desc)) {
ret = xgene_enet_rx_frame(ring, raw_desc);
- else
+ } else {
ret = xgene_enet_tx_completion(ring, raw_desc);
+ is_completion = true;
+ }
xgene_enet_mark_desc_slot_empty(raw_desc);
if (exp_desc)
xgene_enet_mark_desc_slot_empty(exp_desc);
head = (head + 1) & slots;
count++;
+ desc_count++;
processed++;
+ if (is_completion)
+ pdata->txc_level += desc_count;
if (ret)
break;
pdata->ring_ops->wr_cmd(ring, -count);
ring->head = head;
- if (netif_queue_stopped(ring->ndev)) {
- if (pdata->ring_ops->len(ring) < pdata->cp_qcnt_low)
- netif_wake_queue(ring->ndev);
- }
+ if (netif_queue_stopped(ring->ndev))
+ netif_start_queue(ring->ndev);
}
return processed;
pdata->tx_ring->cp_ring = cp_ring;
pdata->tx_ring->dst_ring_num = xgene_enet_dst_ring_num(cp_ring);
- pdata->tx_qcnt_hi = pdata->tx_ring->slots / 2;
- pdata->cp_qcnt_hi = pdata->rx_ring->slots / 2;
- pdata->cp_qcnt_low = pdata->cp_qcnt_hi / 2;
+ pdata->tx_qcnt_hi = pdata->tx_ring->slots - 128;
return 0;
enum xgene_enet_id enet_id;
struct xgene_enet_desc_ring *tx_ring;
struct xgene_enet_desc_ring *rx_ring;
+ u16 tx_level;
+ u16 txc_level;
char *dev_name;
u32 rx_buff_cnt;
u32 tx_qcnt_hi;
- u32 cp_qcnt_hi;
- u32 cp_qcnt_low;
u32 rx_irq;
u32 txc_irq;
u8 cq_cnt;
sizeof(struct atl1c_recv_ret_status) * rx_desc_count +
8 * 4;
- ring_header->desc = pci_alloc_consistent(pdev, ring_header->size,
- &ring_header->dma);
+ ring_header->desc = dma_zalloc_coherent(&pdev->dev, ring_header->size,
+ &ring_header->dma, GFP_KERNEL);
if (unlikely(!ring_header->desc)) {
- dev_err(&pdev->dev, "pci_alloc_consistend failed\n");
+ dev_err(&pdev->dev, "could not get memory for DMA buffer\n");
goto err_nomem;
}
- memset(ring_header->desc, 0, ring_header->size);
/* init TPD ring */
tpd_ring[0].dma = roundup(ring_header->dma, 8);
config AURORA_NB8800
tristate "Aurora AU-NB8800 support"
+ depends on HAS_DMA
select PHYLIB
help
Support for the AU-NB8800 gigabit Ethernet controller.
return rc;
}
-#if (MAX_SKB_FRAGS >= MAX_FETCH_BD - 3)
+/* VXLAN: 4 = 1 (for linear data BD) + 3 (2 for PBD and last BD) */
+#define BNX2X_NUM_VXLAN_TSO_WIN_SUB_BDS 4
+
+/* Regular: 3 = 1 (for linear data BD) + 2 (for PBD and last BD) */
+#define BNX2X_NUM_TSO_WIN_SUB_BDS 3
+
+#if (MAX_SKB_FRAGS >= MAX_FETCH_BD - BDS_PER_TX_PKT)
/* check if packet requires linearization (packet is too fragmented)
no need to check fragmentation if page size > 8K (there will be no
violation to FW restrictions) */
static int bnx2x_pkt_req_lin(struct bnx2x *bp, struct sk_buff *skb,
u32 xmit_type)
{
- int to_copy = 0;
- int hlen = 0;
- int first_bd_sz = 0;
+ int first_bd_sz = 0, num_tso_win_sub = BNX2X_NUM_TSO_WIN_SUB_BDS;
+ int to_copy = 0, hlen = 0;
- /* 3 = 1 (for linear data BD) + 2 (for PBD and last BD) */
- if (skb_shinfo(skb)->nr_frags >= (MAX_FETCH_BD - 3)) {
+ if (xmit_type & XMIT_GSO_ENC)
+ num_tso_win_sub = BNX2X_NUM_VXLAN_TSO_WIN_SUB_BDS;
+ if (skb_shinfo(skb)->nr_frags >= (MAX_FETCH_BD - num_tso_win_sub)) {
if (xmit_type & XMIT_GSO) {
unsigned short lso_mss = skb_shinfo(skb)->gso_size;
- /* Check if LSO packet needs to be copied:
- 3 = 1 (for headers BD) + 2 (for PBD and last BD) */
- int wnd_size = MAX_FETCH_BD - 3;
+ int wnd_size = MAX_FETCH_BD - num_tso_win_sub;
/* Number of windows to check */
int num_wnds = skb_shinfo(skb)->nr_frags - wnd_size;
int wnd_idx = 0;
req.ver_upd = DRV_VER_UPD;
if (BNXT_PF(bp)) {
- unsigned long vf_req_snif_bmap[4];
+ DECLARE_BITMAP(vf_req_snif_bmap, 256);
u32 *data = (u32 *)vf_req_snif_bmap;
- memset(vf_req_snif_bmap, 0, 32);
+ memset(vf_req_snif_bmap, 0, sizeof(vf_req_snif_bmap));
for (i = 0; i < ARRAY_SIZE(bnxt_vf_req_snif); i++)
__set_bit(bnxt_vf_req_snif[i], vf_req_snif_bmap);
- for (i = 0; i < 8; i++) {
- req.vf_req_fwd[i] = cpu_to_le32(*data);
- data++;
- }
+ for (i = 0; i < 8; i++)
+ req.vf_req_fwd[i] = cpu_to_le32(data[i]);
+
req.enables |=
cpu_to_le32(FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD);
}
bp->nge_port_cnt = 1;
}
- bp->state = BNXT_STATE_OPEN;
+ set_bit(BNXT_STATE_OPEN, &bp->state);
bnxt_enable_int(bp);
/* Enable TX queues */
bnxt_tx_enable(bp);
/* Change device state to avoid TX queue wake up's */
bnxt_tx_disable(bp);
- bp->state = BNXT_STATE_CLOSED;
- cancel_work_sync(&bp->sp_task);
+ clear_bit(BNXT_STATE_OPEN, &bp->state);
+ smp_mb__after_atomic();
+ while (test_bit(BNXT_STATE_IN_SP_TASK, &bp->state))
+ msleep(20);
/* Flush rings before disabling interrupts */
bnxt_shutdown_nic(bp, irq_re_init);
static void bnxt_reset_task(struct bnxt *bp)
{
bnxt_dbg_dump_states(bp);
- if (netif_running(bp->dev))
- bnxt_tx_disable(bp); /* prevent tx timout again */
+ if (netif_running(bp->dev)) {
+ bnxt_close_nic(bp, false, false);
+ bnxt_open_nic(bp, false, false);
+ }
}
static void bnxt_tx_timeout(struct net_device *dev)
struct bnxt *bp = container_of(work, struct bnxt, sp_task);
int rc;
- if (bp->state != BNXT_STATE_OPEN)
+ set_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
+ smp_mb__after_atomic();
+ if (!test_bit(BNXT_STATE_OPEN, &bp->state)) {
+ clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
return;
+ }
if (test_and_clear_bit(BNXT_RX_MASK_SP_EVENT, &bp->sp_event))
bnxt_cfg_rx_mode(bp);
bnxt_hwrm_tunnel_dst_port_free(
bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN);
}
- if (test_and_clear_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event))
+ if (test_and_clear_bit(BNXT_RESET_TASK_SP_EVENT, &bp->sp_event)) {
+ /* bnxt_reset_task() calls bnxt_close_nic() which waits
+ * for BNXT_STATE_IN_SP_TASK to clear.
+ */
+ clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
+ rtnl_lock();
bnxt_reset_task(bp);
+ set_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
+ rtnl_unlock();
+ }
+
+ smp_mb__before_atomic();
+ clear_bit(BNXT_STATE_IN_SP_TASK, &bp->state);
}
static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev)
bp->timer.function = bnxt_timer;
bp->current_interval = BNXT_TIMER_INTERVAL;
- bp->state = BNXT_STATE_CLOSED;
+ clear_bit(BNXT_STATE_OPEN, &bp->state);
return 0;
struct timer_list timer;
- int state;
-#define BNXT_STATE_CLOSED 0
-#define BNXT_STATE_OPEN 1
+ unsigned long state;
+#define BNXT_STATE_OPEN 0
+#define BNXT_STATE_IN_SP_TASK 1
struct bnxt_irq *irq_tbl;
u8 mac_addr[ETH_ALEN];
#ifdef CONFIG_BNXT_SRIOV
static int bnxt_vf_ndo_prep(struct bnxt *bp, int vf_id)
{
- if (bp->state != BNXT_STATE_OPEN) {
+ if (!test_bit(BNXT_STATE_OPEN, &bp->state)) {
netdev_err(bp->dev, "vf ndo called though PF is down\n");
return -EINVAL;
}
#define NIC_GET_BGX_FROM_VF_LMAC_MAP(map) ((map >> 4) & 0xF)
#define NIC_GET_LMAC_FROM_VF_LMAC_MAP(map) (map & 0xF)
u8 vf_lmac_map[MAX_LMAC];
- u8 lmac_cnt;
struct delayed_work dwork;
struct workqueue_struct *check_link;
u8 link[MAX_LMAC];
u64 lmac_credit;
nic->num_vf_en = 0;
- nic->lmac_cnt = 0;
for (bgx = 0; bgx < NIC_MAX_BGX; bgx++) {
if (!(bgx_map & (1 << bgx)))
nic->vf_lmac_map[next_bgx_lmac++] =
NIC_SET_VF_LMAC_MAP(bgx, lmac);
nic->num_vf_en += lmac_cnt;
- nic->lmac_cnt += lmac_cnt;
/* Program LMAC credits */
lmac_credit = (1ull << 1); /* channel credit enable */
return 0;
}
+static void nic_enable_vf(struct nicpf *nic, int vf, bool enable)
+{
+ int bgx, lmac;
+
+ nic->vf_enabled[vf] = enable;
+
+ if (vf >= nic->num_vf_en)
+ return;
+
+ bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
+ lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
+
+ bgx_lmac_rx_tx_enable(nic->node, bgx, lmac, enable);
+}
+
/* Interrupt handler to handle mailbox messages from VFs */
static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
{
break;
case NIC_MBOX_MSG_CFG_DONE:
/* Last message of VF config msg sequence */
- nic->vf_enabled[vf] = true;
- if (vf >= nic->lmac_cnt)
- goto unlock;
-
- bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
- lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
-
- bgx_lmac_rx_tx_enable(nic->node, bgx, lmac, true);
+ nic_enable_vf(nic, vf, true);
goto unlock;
case NIC_MBOX_MSG_SHUTDOWN:
/* First msg in VF teardown sequence */
- nic->vf_enabled[vf] = false;
if (vf >= nic->num_vf_en)
nic->sqs_used[vf - nic->num_vf_en] = false;
nic->pqs_vf[vf] = 0;
-
- if (vf >= nic->lmac_cnt)
- break;
-
- bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
- lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
-
- bgx_lmac_rx_tx_enable(nic->node, bgx, lmac, false);
+ nic_enable_vf(nic, vf, false);
break;
case NIC_MBOX_MSG_ALLOC_SQS:
nic_alloc_sqs(nic, &mbx.sqs_alloc);
mbx.link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE;
- for (vf = 0; vf < nic->lmac_cnt; vf++) {
+ for (vf = 0; vf < nic->num_vf_en; vf++) {
/* Poll only if VF is UP */
if (!nic->vf_enabled[vf])
continue;
INIT_LIST_HEAD(&ctbl->hash_list[i]);
cl_list = t4_alloc_mem(clipt_size*sizeof(struct clip_entry));
+ if (!cl_list) {
+ t4_free_mem(ctbl);
+ return NULL;
+ }
ctbl->cl_list = (void *)cl_list;
for (i = 0; i < clipt_size; i++) {
/*
* internal function to open-close roce device during ifup-ifdown.
*/
-void be_roce_dev_open(struct be_adapter *);
-void be_roce_dev_close(struct be_adapter *);
void be_roce_dev_shutdown(struct be_adapter *);
#endif /* BE_H */
return 0;
err_msix:
- for (i--, eqo = &adapter->eq_obj[i]; i >= 0; i--, eqo--)
+ for (i--; i >= 0; i--) {
+ eqo = &adapter->eq_obj[i];
free_irq(be_msix_vec_get(adapter, eqo), eqo);
+ }
dev_warn(&adapter->pdev->dev, "MSIX Request IRQ failed - err %d\n",
status);
be_msix_disable(adapter);
be_disable_if_filters(adapter);
- be_roce_dev_close(adapter);
-
if (adapter->flags & BE_FLAGS_NAPI_ENABLED) {
for_all_evt_queues(adapter, eqo, i) {
napi_disable(&eqo->napi);
be_link_status_update(adapter, link_status);
netif_tx_start_all_queues(netdev);
- be_roce_dev_open(adapter);
-
#ifdef CONFIG_BE2NET_VXLAN
if (skyhawk_chip(adapter))
vxlan_get_rx_port(netdev);
}
}
-static void _be_roce_dev_open(struct be_adapter *adapter)
-{
- if (ocrdma_drv && adapter->ocrdma_dev &&
- ocrdma_drv->state_change_handler)
- ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
- BE_DEV_UP);
-}
-
-void be_roce_dev_open(struct be_adapter *adapter)
-{
- if (be_roce_supported(adapter)) {
- mutex_lock(&be_adapter_list_lock);
- _be_roce_dev_open(adapter);
- mutex_unlock(&be_adapter_list_lock);
- }
-}
-
-static void _be_roce_dev_close(struct be_adapter *adapter)
-{
- if (ocrdma_drv && adapter->ocrdma_dev &&
- ocrdma_drv->state_change_handler)
- ocrdma_drv->state_change_handler(adapter->ocrdma_dev,
- BE_DEV_DOWN);
-}
-
-void be_roce_dev_close(struct be_adapter *adapter)
-{
- if (be_roce_supported(adapter)) {
- mutex_lock(&be_adapter_list_lock);
- _be_roce_dev_close(adapter);
- mutex_unlock(&be_adapter_list_lock);
- }
-}
-
void be_roce_dev_shutdown(struct be_adapter *adapter)
{
if (be_roce_supported(adapter)) {
_be_roce_dev_add(dev);
netdev = dev->netdev;
- if (netif_running(netdev) && netif_oper_up(netdev))
- _be_roce_dev_open(dev);
}
mutex_unlock(&be_adapter_list_lock);
return 0;
void (*state_change_handler) (struct ocrdma_dev *, u32 new_state);
};
-enum {
- BE_DEV_UP = 0,
- BE_DEV_DOWN = 1,
+enum be_roce_event {
BE_DEV_SHUTDOWN = 2
};
*reg = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF);
else { /* !dst_is_aligned */
for (i = 0; i < len; i++, reg++) {
- u32 buf =
- nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF);
-
- /* to accommodate word-unaligned address of "reg"
- * we have to do memcpy_toio() instead of simple "=".
- */
- memcpy_toio((void __iomem *)reg, &buf, sizeof(buf));
+ u32 buf = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF);
+ put_unaligned(buf, reg);
}
}
/* copy last bytes (if any) */
if (last) {
u32 buf = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF);
-
- memcpy_toio((void __iomem *)reg, &buf, last);
+ memcpy((u8*)reg, &buf, last);
}
}
struct nps_enet_tx_ctl tx_ctrl;
short length = skb->len;
u32 i, len = DIV_ROUND_UP(length, sizeof(u32));
- u32 *src = (u32 *)virt_to_phys(skb->data);
+ u32 *src = (void *)skb->data;
bool src_is_aligned = IS_ALIGNED((unsigned long)src, sizeof(u32));
tx_ctrl.value = 0;
if (src_is_aligned)
for (i = 0; i < len; i++, src++)
nps_enet_reg_set(priv, NPS_ENET_REG_TX_BUF, *src);
- else { /* !src_is_aligned */
- for (i = 0; i < len; i++, src++) {
- u32 buf;
-
- /* to accommodate word-unaligned address of "src"
- * we have to do memcpy_fromio() instead of simple "="
- */
- memcpy_fromio(&buf, (void __iomem *)src, sizeof(buf));
- nps_enet_reg_set(priv, NPS_ENET_REG_TX_BUF, buf);
- }
- }
+ else /* !src_is_aligned */
+ for (i = 0; i < len; i++, src++)
+ nps_enet_reg_set(priv, NPS_ENET_REG_TX_BUF,
+ get_unaligned(src));
+
/* Write the length of the Frame */
tx_ctrl.nt = length;
cbd_t __iomem *prev_bd;
cbd_t __iomem *last_tx_bd;
- last_tx_bd = fep->tx_bd_base + (fpi->tx_ring * sizeof(cbd_t));
+ last_tx_bd = fep->tx_bd_base + ((fpi->tx_ring - 1) * sizeof(cbd_t));
/* get the current bd held in TBPTR and scan back from this point */
recheck_bd = curr_tbptr = (cbd_t __iomem *)
* address). Print error message but continue anyway.
*/
if ((void *)tbipa > priv->map + resource_size(&res) - 4)
- dev_err(&pdev->dev, "invalid register map (should be at least 0x%04x to contain TBI address)\n",
+ dev_err(&pdev->dev, "invalid register map (should be at least 0x%04zx to contain TBI address)\n",
((void *)tbipa - priv->map) + 4);
iowrite32be(be32_to_cpup(prop), tbipa);
FSL_GIANFAR_DEV_HAS_VLAN |
FSL_GIANFAR_DEV_HAS_MAGIC_PACKET |
FSL_GIANFAR_DEV_HAS_EXTENDED_HASH |
- FSL_GIANFAR_DEV_HAS_TIMER;
+ FSL_GIANFAR_DEV_HAS_TIMER |
+ FSL_GIANFAR_DEV_HAS_RX_FILER;
err = of_property_read_string(np, "phy-connection-type", &ctype);
priv->rx_queue[i]->rxic = DEFAULT_RXIC;
}
- /* always enable rx filer */
- priv->rx_filer_enable = 1;
+ /* Always enable rx filer if available */
+ priv->rx_filer_enable =
+ (priv->device_flags & FSL_GIANFAR_DEV_HAS_RX_FILER) ? 1 : 0;
/* Enable most messages by default */
priv->msg_enable = (NETIF_MSG_IFUP << 1 ) - 1;
/* use pritority h/w tx queue scheduling for single queue devices */
#define FSL_GIANFAR_DEV_HAS_BUF_STASHING 0x00000400
#define FSL_GIANFAR_DEV_HAS_TIMER 0x00000800
#define FSL_GIANFAR_DEV_HAS_WAKE_ON_FILER 0x00001000
+#define FSL_GIANFAR_DEV_HAS_RX_FILER 0x00002000
#if (MAXGROUPS == 2)
#define DEFAULT_MAPPING 0xAA
if (MAC_IS_ALL_ZEROS(mac_entry->addr) ||
MAC_IS_BROADCAST(mac_entry->addr) ||
MAC_IS_MULTICAST(mac_entry->addr)) {
- dev_err(dsaf_dev->dev,
- "set_uc %s Mac %02x:%02x:%02x:%02x:%02x:%02x err!\n",
- dsaf_dev->ae_dev.name, mac_entry->addr[0],
- mac_entry->addr[1], mac_entry->addr[2],
- mac_entry->addr[3], mac_entry->addr[4],
- mac_entry->addr[5]);
+ dev_err(dsaf_dev->dev, "set_uc %s Mac %pM err!\n",
+ dsaf_dev->ae_dev.name, mac_entry->addr);
return -EINVAL;
}
/* mac addr check */
if (MAC_IS_ALL_ZEROS(mac_entry->addr)) {
- dev_err(dsaf_dev->dev,
- "set uc %s Mac %02x:%02x:%02x:%02x:%02x:%02x err!\n",
- dsaf_dev->ae_dev.name, mac_entry->addr[0],
- mac_entry->addr[1], mac_entry->addr[2],
- mac_entry->addr[3],
- mac_entry->addr[4], mac_entry->addr[5]);
+ dev_err(dsaf_dev->dev, "set uc %s Mac %pM err!\n",
+ dsaf_dev->ae_dev.name, mac_entry->addr);
return -EINVAL;
}
/*chechk mac addr */
if (MAC_IS_ALL_ZEROS(mac_entry->addr)) {
- dev_err(dsaf_dev->dev,
- "set_entry failed,addr %02x:%02x:%02x:%02x:%02x:%02x!\n",
- mac_entry->addr[0], mac_entry->addr[1],
- mac_entry->addr[2], mac_entry->addr[3],
- mac_entry->addr[4], mac_entry->addr[5]);
+ dev_err(dsaf_dev->dev, "set_entry failed,addr %pM!\n",
+ mac_entry->addr);
return -EINVAL;
}
/*check mac addr */
if (MAC_IS_ALL_ZEROS(addr) || MAC_IS_BROADCAST(addr)) {
- dev_err(dsaf_dev->dev,
- "del_entry failed,addr %02x:%02x:%02x:%02x:%02x:%02x!\n",
- addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
+ dev_err(dsaf_dev->dev, "del_entry failed,addr %pM!\n",
+ addr);
return -EINVAL;
}
/*check mac addr */
if (MAC_IS_ALL_ZEROS(mac_entry->addr)) {
- dev_err(dsaf_dev->dev,
- "del_port failed, addr %02x:%02x:%02x:%02x:%02x:%02x!\n",
- mac_entry->addr[0], mac_entry->addr[1],
- mac_entry->addr[2], mac_entry->addr[3],
- mac_entry->addr[4], mac_entry->addr[5]);
+ dev_err(dsaf_dev->dev, "del_port failed, addr %pM!\n",
+ mac_entry->addr);
return -EINVAL;
}
/* check macaddr */
if (MAC_IS_ALL_ZEROS(mac_entry->addr) ||
MAC_IS_BROADCAST(mac_entry->addr)) {
- dev_err(dsaf_dev->dev,
- "get_entry failed,addr %02x:%02x:%02x:%02x:%02x:%02x\n",
- mac_entry->addr[0], mac_entry->addr[1],
- mac_entry->addr[2], mac_entry->addr[3],
- mac_entry->addr[4], mac_entry->addr[5]);
+ dev_err(dsaf_dev->dev, "get_entry failed,addr %pM\n",
+ mac_entry->addr);
return -EINVAL;
}
/*check mac addr */
if (MAC_IS_ALL_ZEROS(mac_entry->addr) ||
MAC_IS_BROADCAST(mac_entry->addr)) {
- dev_err(dsaf_dev->dev,
- "get_entry failed,addr %02x:%02x:%02x:%02x:%02x:%02x\n",
- mac_entry->addr[0], mac_entry->addr[1],
- mac_entry->addr[2], mac_entry->addr[3],
- mac_entry->addr[4], mac_entry->addr[5]);
+ dev_err(dsaf_dev->dev, "get_entry failed,addr %pM\n",
+ mac_entry->addr);
return -EINVAL;
}
#define XGMAC_PAUSE_CTL_RSP_MODE_B 2
#define XGMAC_PAUSE_CTL_TX_XOFF_B 3
-static inline void dsaf_write_reg(void *base, u32 reg, u32 value)
+static inline void dsaf_write_reg(void __iomem *base, u32 reg, u32 value)
{
u8 __iomem *reg_addr = ACCESS_ONCE(base);
#define dsaf_write_dev(a, reg, value) \
dsaf_write_reg((a)->io_base, (reg), (value))
-static inline u32 dsaf_read_reg(u8 *base, u32 reg)
+static inline u32 dsaf_read_reg(u8 __iomem *base, u32 reg)
{
u8 __iomem *reg_addr = ACCESS_ONCE(base);
#define dsaf_set_bit(origin, shift, val) \
dsaf_set_field((origin), (1ull << (shift)), (shift), (val))
-static inline void dsaf_set_reg_field(void *base, u32 reg, u32 mask, u32 shift,
- u32 val)
+static inline void dsaf_set_reg_field(void __iomem *base, u32 reg, u32 mask,
+ u32 shift, u32 val)
{
u32 origin = dsaf_read_reg(base, reg);
#define dsaf_get_bit(origin, shift) \
dsaf_get_field((origin), (1ull << (shift)), (shift))
-static inline u32 dsaf_get_reg_field(void *base, u32 reg, u32 mask, u32 shift)
+static inline u32 dsaf_get_reg_field(void __iomem *base, u32 reg, u32 mask,
+ u32 shift)
{
u32 origin;
goto init_adminq_exit;
}
- /* initialize locks */
- mutex_init(&hw->aq.asq_mutex);
- mutex_init(&hw->aq.arq_mutex);
-
/* Set up register offsets */
i40e_adminq_init_regs(hw);
i40e_shutdown_asq(hw);
i40e_shutdown_arq(hw);
- /* destroy the locks */
-
if (hw->nvm_buff.va)
i40e_free_virt_mem(hw, &hw->nvm_buff);
/* set up a default setting for link flow control */
pf->hw.fc.requested_mode = I40E_FC_NONE;
+ /* set up the locks for the AQ, do this only once in probe
+ * and destroy them only once in remove
+ */
+ mutex_init(&hw->aq.asq_mutex);
+ mutex_init(&hw->aq.arq_mutex);
+
err = i40e_init_adminq(hw);
/* provide nvm, fw, api versions */
set_bit(__I40E_DOWN, &pf->state);
del_timer_sync(&pf->service_timer);
cancel_work_sync(&pf->service_task);
- i40e_fdir_teardown(pf);
if (pf->flags & I40E_FLAG_SRIOV_ENABLED) {
i40e_free_vfs(pf);
"Failed to destroy the Admin Queue resources: %d\n",
ret_code);
+ /* destroy the locks only once, here */
+ mutex_destroy(&hw->aq.arq_mutex);
+ mutex_destroy(&hw->aq.asq_mutex);
+
/* Clear all dynamic memory lists of rings, q_vectors, and VSIs */
i40e_clear_interrupt_scheme(pf);
for (i = 0; i < pf->num_alloc_vsi; i++) {
goto init_adminq_exit;
}
- /* initialize locks */
- mutex_init(&hw->aq.asq_mutex);
- mutex_init(&hw->aq.arq_mutex);
-
/* Set up register offsets */
i40e_adminq_init_regs(hw);
i40e_shutdown_asq(hw);
i40e_shutdown_arq(hw);
- /* destroy the locks */
-
if (hw->nvm_buff.va)
i40e_free_virt_mem(hw, &hw->nvm_buff);
hw->bus.device = PCI_SLOT(pdev->devfn);
hw->bus.func = PCI_FUNC(pdev->devfn);
+ /* set up the locks for the AQ, do this only once in probe
+ * and destroy them only once in remove
+ */
+ mutex_init(&hw->aq.asq_mutex);
+ mutex_init(&hw->aq.arq_mutex);
+
INIT_LIST_HEAD(&adapter->mac_filter_list);
INIT_LIST_HEAD(&adapter->vlan_filter_list);
if (hw->aq.asq.count)
i40evf_shutdown_adminq(hw);
+ /* destroy the locks only once, here */
+ mutex_destroy(&hw->aq.arq_mutex);
+ mutex_destroy(&hw->aq.asq_mutex);
+
iounmap(hw->hw_addr);
pci_release_regions(pdev);
*/
if (netif_running(dev))
ixgbe_close(dev);
+ else
+ ixgbe_reset(adapter);
+
ixgbe_clear_interrupt_scheme(adapter);
#ifdef CONFIG_IXGBE_DCB
}
/* Free all buffers from the pool */
-static void mvpp2_bm_bufs_free(struct mvpp2 *priv, struct mvpp2_bm_pool *bm_pool)
+static void mvpp2_bm_bufs_free(struct device *dev, struct mvpp2 *priv,
+ struct mvpp2_bm_pool *bm_pool)
{
int i;
for (i = 0; i < bm_pool->buf_num; i++) {
+ dma_addr_t buf_phys_addr;
u32 vaddr;
/* Get buffer virtual address (indirect access) */
- mvpp2_read(priv, MVPP2_BM_PHY_ALLOC_REG(bm_pool->id));
+ buf_phys_addr = mvpp2_read(priv,
+ MVPP2_BM_PHY_ALLOC_REG(bm_pool->id));
vaddr = mvpp2_read(priv, MVPP2_BM_VIRT_ALLOC_REG);
+
+ dma_unmap_single(dev, buf_phys_addr,
+ bm_pool->buf_size, DMA_FROM_DEVICE);
+
if (!vaddr)
break;
dev_kfree_skb_any((struct sk_buff *)vaddr);
{
u32 val;
- mvpp2_bm_bufs_free(priv, bm_pool);
+ mvpp2_bm_bufs_free(&pdev->dev, priv, bm_pool);
if (bm_pool->buf_num) {
WARN(1, "cannot free all buffers in pool %d\n", bm_pool->id);
return 0;
MVPP2_BM_LONG_BUF_NUM :
MVPP2_BM_SHORT_BUF_NUM;
else
- mvpp2_bm_bufs_free(port->priv, new_pool);
+ mvpp2_bm_bufs_free(port->dev->dev.parent,
+ port->priv, new_pool);
new_pool->pkt_size = pkt_size;
int pkt_size = MVPP2_RX_PKT_SIZE(mtu);
/* Update BM pool with new buffer size */
- mvpp2_bm_bufs_free(port->priv, port_pool);
+ mvpp2_bm_bufs_free(dev->dev.parent, port->priv, port_pool);
if (port_pool->buf_num) {
WARN(1, "cannot free all buffers in pool %d\n", port_pool->id);
return -EIO;
mvpp2_txq_inc_get(txq_pcpu);
- if (!skb)
- continue;
-
dma_unmap_single(port->dev->dev.parent, buf_phys_addr,
skb_headlen(skb), DMA_TO_DEVICE);
+ if (!skb)
+ continue;
dev_kfree_skb_any(skb);
}
}
struct mvpp2_rx_queue *rxq)
{
struct net_device *dev = port->dev;
- int rx_received, rx_filled, i;
+ int rx_received;
+ int rx_done = 0;
u32 rcvd_pkts = 0;
u32 rcvd_bytes = 0;
if (rx_todo > rx_received)
rx_todo = rx_received;
- rx_filled = 0;
- for (i = 0; i < rx_todo; i++) {
+ while (rx_done < rx_todo) {
struct mvpp2_rx_desc *rx_desc = mvpp2_rxq_next_desc_get(rxq);
struct mvpp2_bm_pool *bm_pool;
struct sk_buff *skb;
+ dma_addr_t phys_addr;
u32 bm, rx_status;
int pool, rx_bytes, err;
- rx_filled++;
+ rx_done++;
rx_status = rx_desc->status;
rx_bytes = rx_desc->data_size - MVPP2_MH_SIZE;
+ phys_addr = rx_desc->buf_phys_addr;
bm = mvpp2_bm_cookie_build(rx_desc);
pool = mvpp2_bm_cookie_pool_get(bm);
* comprised by the RX descriptor.
*/
if (rx_status & MVPP2_RXD_ERR_SUMMARY) {
+ err_drop_frame:
dev->stats.rx_errors++;
mvpp2_rx_error(port, rx_desc);
+ /* Return the buffer to the pool */
mvpp2_pool_refill(port, bm, rx_desc->buf_phys_addr,
rx_desc->buf_cookie);
continue;
skb = (struct sk_buff *)rx_desc->buf_cookie;
+ err = mvpp2_rx_refill(port, bm_pool, bm, 0);
+ if (err) {
+ netdev_err(port->dev, "failed to refill BM pools\n");
+ goto err_drop_frame;
+ }
+
+ dma_unmap_single(dev->dev.parent, phys_addr,
+ bm_pool->buf_size, DMA_FROM_DEVICE);
+
rcvd_pkts++;
rcvd_bytes += rx_bytes;
atomic_inc(&bm_pool->in_use);
mvpp2_rx_csum(port, rx_status, skb);
napi_gro_receive(&port->napi, skb);
-
- err = mvpp2_rx_refill(port, bm_pool, bm, 0);
- if (err) {
- netdev_err(port->dev, "failed to refill BM pools\n");
- rx_filled--;
- }
}
if (rcvd_pkts) {
/* Update Rx queue management counters */
wmb();
- mvpp2_rxq_status_update(port, rxq->id, rx_todo, rx_filled);
+ mvpp2_rxq_status_update(port, rxq->id, rx_done, rx_done);
return rx_todo;
}
unsigned long flags;
u64 ns, zero = 0;
+ /* mlx4_en_init_timestamp is called for each netdev.
+ * mdev->ptp_clock is common for all ports, skip initialization if
+ * it was already done for another port.
+ */
+ if (mdev->ptp_clock)
+ return;
+
rwlock_init(&mdev->clock_lock);
memset(&mdev->cycles, 0, sizeof(mdev->cycles));
if (mdev->pndev[i])
mlx4_en_destroy_netdev(mdev->pndev[i]);
- if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
- mlx4_en_remove_timestamp(mdev);
-
flush_workqueue(mdev->workqueue);
destroy_workqueue(mdev->workqueue);
(void) mlx4_mr_free(dev, &mdev->mr);
mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
mdev->port_cnt++;
- /* Initialize time stamp mechanism */
- if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
- mlx4_en_init_timestamp(mdev);
-
/* Set default number of RX rings*/
mlx4_en_set_num_rx_rings(mdev);
/* flush any pending task for this netdev */
flush_workqueue(mdev->workqueue);
+ if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
+ mlx4_en_remove_timestamp(mdev);
+
/* Detach the netdev so tasks would not attempt to access it */
mutex_lock(&mdev->state_lock);
mdev->pndev[priv->port] = NULL;
}
queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY);
+ /* Initialize time stamp mechanism */
if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS)
- queue_delayed_work(mdev->workqueue, &priv->service_task,
- SERVICE_TASK_DELAY);
+ mlx4_en_init_timestamp(mdev);
+
+ queue_delayed_work(mdev->workqueue, &priv->service_task,
+ SERVICE_TASK_DELAY);
mlx4_en_set_stats_bitmap(mdev->dev, &priv->stats_bitmap,
mdev->profile.prof[priv->port].rx_ppp,
return -EOPNOTSUPP;
ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf;
- ctrl->port = mlx4_slave_convert_port(dev, slave, ctrl->port);
- if (ctrl->port <= 0)
+ err = mlx4_slave_convert_port(dev, slave, ctrl->port);
+ if (err <= 0)
return -EINVAL;
+ ctrl->port = err;
qpn = be32_to_cpu(ctrl->qpn) & 0xffffff;
err = get_res(dev, slave, qpn, RES_QP, &rqp);
if (err) {
break; /* Better luck next round. */
np->rx_dma[entry] = pci_map_single(np->pci_dev,
skb->data, buflen, PCI_DMA_FROMDEVICE);
+ if (pci_dma_mapping_error(np->pci_dev,
+ np->rx_dma[entry])) {
+ dev_kfree_skb_any(skb);
+ np->rx_skbuff[entry] = NULL;
+ break; /* Better luck next round. */
+ }
np->rx_ring[entry].addr = cpu_to_le32(np->rx_dma[entry]);
}
np->rx_ring[entry].cmd_status = cpu_to_le32(np->rx_buf_sz);
np->tx_skbuff[entry] = skb;
np->tx_dma[entry] = pci_map_single(np->pci_dev,
skb->data,skb->len, PCI_DMA_TODEVICE);
+ if (pci_dma_mapping_error(np->pci_dev, np->tx_dma[entry])) {
+ np->tx_skbuff[entry] = NULL;
+ dev_kfree_skb_irq(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+ }
np->tx_ring[entry].addr = cpu_to_le32(np->tx_dma[entry]);
/* Flag indicating whether interrupts are enabled or not*/
bool b_int_enabled;
+ bool b_int_requested;
struct qed_mcp_info *mcp_info;
u32 input_len, u8 *input_buf,
u32 max_size, u8 *unzip_buf);
+int qed_slowpath_irq_req(struct qed_hwfn *hwfn);
+
#define QED_ETH_INTERFACE_VERSION 300
#endif /* _QED_H */
return rc;
}
-static u32 qed_hw_bar_size(struct qed_dev *cdev,
- u8 bar_id)
+static u32 qed_hw_bar_size(struct qed_hwfn *p_hwfn,
+ u8 bar_id)
{
- u32 size = pci_resource_len(cdev->pdev, (bar_id > 0) ? 2 : 0);
+ u32 bar_reg = (bar_id == 0 ? PGLUE_B_REG_PF_BAR0_SIZE
+ : PGLUE_B_REG_PF_BAR1_SIZE);
+ u32 val = qed_rd(p_hwfn, p_hwfn->p_main_ptt, bar_reg);
- return size / cdev->num_hwfns;
+ /* Get the BAR size(in KB) from hardware given val */
+ return 1 << (val + 15);
}
int qed_hw_prepare(struct qed_dev *cdev,
int personality)
{
- int rc, i;
+ struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev);
+ int rc;
/* Store the precompiled init data ptrs */
qed_init_iro_array(cdev);
/* Initialize the first hwfn - will learn number of hwfns */
- rc = qed_hw_prepare_single(&cdev->hwfns[0], cdev->regview,
+ rc = qed_hw_prepare_single(p_hwfn,
+ cdev->regview,
cdev->doorbells, personality);
if (rc)
return rc;
- personality = cdev->hwfns[0].hw_info.personality;
+ personality = p_hwfn->hw_info.personality;
/* Initialize the rest of the hwfns */
- for (i = 1; i < cdev->num_hwfns; i++) {
+ if (cdev->num_hwfns > 1) {
void __iomem *p_regview, *p_doorbell;
+ u8 __iomem *addr;
+
+ /* adjust bar offset for second engine */
+ addr = cdev->regview + qed_hw_bar_size(p_hwfn, 0) / 2;
+ p_regview = addr;
- p_regview = cdev->regview +
- i * qed_hw_bar_size(cdev, 0);
- p_doorbell = cdev->doorbells +
- i * qed_hw_bar_size(cdev, 1);
- rc = qed_hw_prepare_single(&cdev->hwfns[i], p_regview,
+ /* adjust doorbell bar offset for second engine */
+ addr = cdev->doorbells + qed_hw_bar_size(p_hwfn, 1) / 2;
+ p_doorbell = addr;
+
+ /* prepare second hw function */
+ rc = qed_hw_prepare_single(&cdev->hwfns[1], p_regview,
p_doorbell, personality);
+
+ /* in case of error, need to free the previously
+ * initialized hwfn 0.
+ */
if (rc) {
- /* Cleanup previously initialized hwfns */
- while (--i >= 0) {
- qed_init_free(&cdev->hwfns[i]);
- qed_mcp_free(&cdev->hwfns[i]);
- qed_hw_hwfn_free(&cdev->hwfns[i]);
- }
- return rc;
+ qed_init_free(p_hwfn);
+ qed_mcp_free(p_hwfn);
+ qed_hw_hwfn_free(p_hwfn);
}
}
- return 0;
+ return rc;
}
void qed_hw_remove(struct qed_dev *cdev)
qed_wr(p_hwfn, p_ptt, IGU_REG_PF_CONFIGURATION, igu_pf_conf);
}
-void qed_int_igu_enable(struct qed_hwfn *p_hwfn,
- struct qed_ptt *p_ptt,
- enum qed_int_mode int_mode)
+int qed_int_igu_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+ enum qed_int_mode int_mode)
{
- int i;
-
- p_hwfn->b_int_enabled = 1;
+ int rc, i;
/* Mask non-link attentions */
for (i = 0; i < 9; i++)
qed_wr(p_hwfn, p_ptt,
MISC_REG_AEU_ENABLE1_IGU_OUT_0 + (i << 2), 0);
- /* Enable interrupt Generation */
- qed_int_igu_enable_int(p_hwfn, p_ptt, int_mode);
-
/* Configure AEU signal change to produce attentions for link */
qed_wr(p_hwfn, p_ptt, IGU_REG_LEADING_EDGE_LATCH, 0xfff);
qed_wr(p_hwfn, p_ptt, IGU_REG_TRAILING_EDGE_LATCH, 0xfff);
/* Unmask AEU signals toward IGU */
qed_wr(p_hwfn, p_ptt, MISC_REG_AEU_MASK_ATTN_IGU, 0xff);
+ if ((int_mode != QED_INT_MODE_INTA) || IS_LEAD_HWFN(p_hwfn)) {
+ rc = qed_slowpath_irq_req(p_hwfn);
+ if (rc != 0) {
+ DP_NOTICE(p_hwfn, "Slowpath IRQ request failed\n");
+ return -EINVAL;
+ }
+ p_hwfn->b_int_requested = true;
+ }
+ /* Enable interrupt Generation */
+ qed_int_igu_enable_int(p_hwfn, p_ptt, int_mode);
+ p_hwfn->b_int_enabled = 1;
+
+ return rc;
}
void qed_int_igu_disable_int(struct qed_hwfn *p_hwfn,
return info->igu_sb_cnt;
}
+
+void qed_int_disable_post_isr_release(struct qed_dev *cdev)
+{
+ int i;
+
+ for_each_hwfn(cdev, i)
+ cdev->hwfns[i].b_int_requested = false;
+}
int *p_iov_blks);
/**
- * @file
+ * @brief qed_int_disable_post_isr_release - performs the cleanup post ISR
+ * release. The API needs to be called after releasing all slowpath IRQs
+ * of the device.
+ *
+ * @param cdev
*
- * @brief Interrupt handler
*/
+void qed_int_disable_post_isr_release(struct qed_dev *cdev);
#define QED_CAU_DEF_RX_TIMER_RES 0
#define QED_CAU_DEF_TX_TIMER_RES 0
* @param p_hwfn
* @param p_ptt
* @param int_mode
+ *
+ * @return int
*/
-void qed_int_igu_enable(struct qed_hwfn *p_hwfn,
- struct qed_ptt *p_ptt,
- enum qed_int_mode int_mode);
+int qed_int_igu_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+ enum qed_int_mode int_mode);
/**
* @brief - Initialize CAU status block entry
return rc;
}
-static int qed_slowpath_irq_req(struct qed_dev *cdev)
+int qed_slowpath_irq_req(struct qed_hwfn *hwfn)
{
- int i = 0, rc = 0;
+ struct qed_dev *cdev = hwfn->cdev;
+ int rc = 0;
+ u8 id;
if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
- /* Request all the slowpath MSI-X vectors */
- for (i = 0; i < cdev->num_hwfns; i++) {
- snprintf(cdev->hwfns[i].name, NAME_SIZE,
- "sp-%d-%02x:%02x.%02x",
- i, cdev->pdev->bus->number,
- PCI_SLOT(cdev->pdev->devfn),
- cdev->hwfns[i].abs_pf_id);
-
- rc = request_irq(cdev->int_params.msix_table[i].vector,
- qed_msix_sp_int, 0,
- cdev->hwfns[i].name,
- cdev->hwfns[i].sp_dpc);
- if (rc)
- break;
-
- DP_VERBOSE(&cdev->hwfns[i],
- (NETIF_MSG_INTR | QED_MSG_SP),
+ id = hwfn->my_id;
+ snprintf(hwfn->name, NAME_SIZE, "sp-%d-%02x:%02x.%02x",
+ id, cdev->pdev->bus->number,
+ PCI_SLOT(cdev->pdev->devfn), hwfn->abs_pf_id);
+ rc = request_irq(cdev->int_params.msix_table[id].vector,
+ qed_msix_sp_int, 0, hwfn->name, hwfn->sp_dpc);
+ if (!rc)
+ DP_VERBOSE(hwfn, (NETIF_MSG_INTR | QED_MSG_SP),
"Requested slowpath MSI-X\n");
- }
-
- if (i != cdev->num_hwfns) {
- /* Free already request MSI-X vectors */
- for (i--; i >= 0; i--) {
- unsigned int vec =
- cdev->int_params.msix_table[i].vector;
- synchronize_irq(vec);
- free_irq(cdev->int_params.msix_table[i].vector,
- cdev->hwfns[i].sp_dpc);
- }
- }
} else {
unsigned long flags = 0;
if (cdev->int_params.out.int_mode == QED_INT_MODE_MSIX) {
for_each_hwfn(cdev, i) {
+ if (!cdev->hwfns[i].b_int_requested)
+ break;
synchronize_irq(cdev->int_params.msix_table[i].vector);
free_irq(cdev->int_params.msix_table[i].vector,
cdev->hwfns[i].sp_dpc);
}
} else {
- free_irq(cdev->pdev->irq, cdev);
+ if (QED_LEADING_HWFN(cdev)->b_int_requested)
+ free_irq(cdev->pdev->irq, cdev);
}
+ qed_int_disable_post_isr_release(cdev);
}
static int qed_nic_stop(struct qed_dev *cdev)
if (rc)
goto err1;
- /* Request the slowpath IRQ */
- rc = qed_slowpath_irq_req(cdev);
- if (rc)
- goto err2;
-
/* Allocate stream for unzipping */
rc = qed_alloc_stream_mem(cdev);
if (rc) {
DP_NOTICE(cdev, "Failed to allocate stream memory\n");
- goto err3;
+ goto err2;
}
/* Start the slowpath */
0x7 << 0)
#define MCP_REG_NVM_CFG4_FLASH_SIZE_SHIFT \
0
+#define PGLUE_B_REG_PF_BAR0_SIZE \
+ 0x2aae60UL
+#define PGLUE_B_REG_PF_BAR1_SIZE \
+ 0x2aae64UL
#endif
dma_addr_t p_phys;
struct qed_spq_entry *p_virt;
- /* Used as index for completions (returns on EQ by FW) */
- u16 echo_idx;
+#define SPQ_RING_SIZE \
+ (CORE_SPQE_PAGE_SIZE_BYTES / sizeof(struct slow_path_element))
+
+ /* Bitmap for handling out-of-order completions */
+ DECLARE_BITMAP(p_comp_bitmap, SPQ_RING_SIZE);
+ u8 comp_bitmap_idx;
/* Statistics */
u32 unlimited_pending_count;
qed_spq_fill_entry(struct qed_hwfn *p_hwfn,
struct qed_spq_entry *p_ent)
{
- p_ent->elem.hdr.echo = 0;
- p_hwfn->p_spq->echo_idx++;
p_ent->flags = 0;
switch (p_ent->comp_mode) {
struct qed_spq *p_spq,
struct qed_spq_entry *p_ent)
{
- struct qed_chain *p_chain = &p_hwfn->p_spq->chain;
+ struct qed_chain *p_chain = &p_hwfn->p_spq->chain;
+ u16 echo = qed_chain_get_prod_idx(p_chain);
struct slow_path_element *elem;
struct core_db_data db;
+ p_ent->elem.hdr.echo = cpu_to_le16(echo);
elem = qed_chain_produce(p_chain);
if (!elem) {
DP_NOTICE(p_hwfn, "Failed to produce from SPQ chain\n");
p_spq->comp_count = 0;
p_spq->comp_sent_count = 0;
p_spq->unlimited_pending_count = 0;
- p_spq->echo_idx = 0;
+
+ bitmap_zero(p_spq->p_comp_bitmap, SPQ_RING_SIZE);
+ p_spq->comp_bitmap_idx = 0;
/* SPQ cid, cannot fail */
qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_CORE, &p_spq->cid);
struct qed_spq *p_spq = p_hwfn->p_spq;
if (p_ent->queue == &p_spq->unlimited_pending) {
- struct qed_spq_entry *p_en2;
if (list_empty(&p_spq->free_pool)) {
list_add_tail(&p_ent->list, &p_spq->unlimited_pending);
p_spq->unlimited_pending_count++;
return 0;
- }
+ } else {
+ struct qed_spq_entry *p_en2;
- p_en2 = list_first_entry(&p_spq->free_pool,
- struct qed_spq_entry,
- list);
- list_del(&p_en2->list);
+ p_en2 = list_first_entry(&p_spq->free_pool,
+ struct qed_spq_entry,
+ list);
+ list_del(&p_en2->list);
+
+ /* Copy the ring element physical pointer to the new
+ * entry, since we are about to override the entire ring
+ * entry and don't want to lose the pointer.
+ */
+ p_ent->elem.data_ptr = p_en2->elem.data_ptr;
- /* Strcut assignment */
- *p_en2 = *p_ent;
+ *p_en2 = *p_ent;
- kfree(p_ent);
+ kfree(p_ent);
- p_ent = p_en2;
+ p_ent = p_en2;
+ }
}
/* entry is to be placed in 'pending' queue */
list_for_each_entry_safe(p_ent, tmp, &p_spq->completion_pending,
list) {
if (p_ent->elem.hdr.echo == echo) {
+ u16 pos = le16_to_cpu(echo) % SPQ_RING_SIZE;
+
list_del(&p_ent->list);
- qed_chain_return_produced(&p_spq->chain);
+ /* Avoid overriding of SPQ entries when getting
+ * out-of-order completions, by marking the completions
+ * in a bitmap and increasing the chain consumer only
+ * for the first successive completed entries.
+ */
+ bitmap_set(p_spq->p_comp_bitmap, pos, SPQ_RING_SIZE);
+
+ while (test_bit(p_spq->comp_bitmap_idx,
+ p_spq->p_comp_bitmap)) {
+ bitmap_clear(p_spq->p_comp_bitmap,
+ p_spq->comp_bitmap_idx,
+ SPQ_RING_SIZE);
+ p_spq->comp_bitmap_idx++;
+ qed_chain_return_produced(&p_spq->chain);
+ }
+
p_spq->comp_count++;
found = p_ent;
break;
}
+
+ /* This is relatively uncommon - depends on scenarios
+ * which have multiple per-PF sent ramrods.
+ */
+ DP_VERBOSE(p_hwfn, QED_MSG_SPQ,
+ "Got completion for echo %04x - doesn't match echo %04x in completion pending list\n",
+ le16_to_cpu(echo),
+ le16_to_cpu(p_ent->elem.hdr.echo));
}
/* Release lock before callback, as callback may post
u32 state;
state = QLCRDX(ahw, QLC_83XX_VNIC_STATE);
- while (state != QLCNIC_DEV_NPAR_OPER && idc->vnic_wait_limit--) {
+ while (state != QLCNIC_DEV_NPAR_OPER && idc->vnic_wait_limit) {
+ idc->vnic_wait_limit--;
msleep(1000);
state = QLCRDX(ahw, QLC_83XX_VNIC_STATE);
}
- if (!idc->vnic_wait_limit) {
+ if (state != QLCNIC_DEV_NPAR_OPER) {
dev_err(&adapter->pdev->dev,
"vNIC mode not operational, state check timed out.\n");
return -EIO;
int i, err = 0;
for (i = 0; i < ahw->num_msix; i++) {
- qlcnic_alloc_mbx_args(&cmd, adapter,
- QLCNIC_CMD_MQ_TX_CONFIG_INTR);
+ err = qlcnic_alloc_mbx_args(&cmd, adapter,
+ QLCNIC_CMD_MQ_TX_CONFIG_INTR);
+ if (err)
+ return err;
type = op_type ? QLCNIC_INTRPT_ADD : QLCNIC_INTRPT_DEL;
val = type | (ahw->intr_tbl[i].type << 4);
if (ahw->intr_tbl[i].type == QLCNIC_INTRPT_MSIX)
/* Wait for an outstanding reset to complete. */
if (!test_bit(QL_ADAPTER_UP, &qdev->flags)) {
- int i = 3;
- while (i-- && !test_bit(QL_ADAPTER_UP, &qdev->flags)) {
+ int i = 4;
+
+ while (--i && !test_bit(QL_ADAPTER_UP, &qdev->flags)) {
netif_err(qdev, ifup, qdev->ndev,
"Waiting for adapter UP...\n");
ssleep(1);
netdev_info(qca->net_dev, "Transmit timeout at %ld, latency %ld\n",
jiffies, jiffies - dev->trans_start);
qca->net_dev->stats.tx_errors++;
- /* wake the queue if there is room */
- if (qcaspi_tx_ring_has_space(&qca->txr))
- netif_wake_queue(dev);
+ /* Trigger tx queue flush and QCA7000 reset */
+ qca->sync = QCASPI_SYNC_UNKNOWN;
}
static int
netdev_info(ndev, "limited PHY to 100Mbit/s\n");
}
+ /* 10BASE is not supported */
+ phydev->supported &= ~PHY_10BT_FEATURES;
+
netdev_info(ndev, "attached PHY %d (IRQ %d) to driver %s\n",
phydev->addr, phydev->irq, phydev->drv->name);
"rx_queue_1_mcast_packets",
"rx_queue_1_errors",
"rx_queue_1_crc_errors",
- "rx_queue_1_frame_errors_",
+ "rx_queue_1_frame_errors",
"rx_queue_1_length_errors",
"rx_queue_1_missed_errors",
"rx_queue_1_over_errors",
NETIF_MSG_RX_ERR| \
NETIF_MSG_TX_ERR)
+#define SH_ETH_OFFSET_INVALID ((u16)~0)
+
#define SH_ETH_OFFSET_DEFAULTS \
[0 ... SH_ETH_MAX_REGISTER_OFFSET - 1] = SH_ETH_OFFSET_INVALID
static void sh_eth_rcv_snd_disable(struct net_device *ndev);
static struct net_device_stats *sh_eth_get_stats(struct net_device *ndev);
+static void sh_eth_write(struct net_device *ndev, u32 data, int enum_index)
+{
+ struct sh_eth_private *mdp = netdev_priv(ndev);
+ u16 offset = mdp->reg_offset[enum_index];
+
+ if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
+ return;
+
+ iowrite32(data, mdp->addr + offset);
+}
+
+static u32 sh_eth_read(struct net_device *ndev, int enum_index)
+{
+ struct sh_eth_private *mdp = netdev_priv(ndev);
+ u16 offset = mdp->reg_offset[enum_index];
+
+ if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
+ return ~0U;
+
+ return ioread32(mdp->addr + offset);
+}
+
static bool sh_eth_is_gether(struct sh_eth_private *mdp)
{
return mdp->reg_offset == sh_eth_offset_gigabit;
int tx_ringsize = sizeof(*txdesc) * mdp->num_tx_ring;
int skbuff_size = mdp->rx_buf_sz + SH_ETH_RX_ALIGN + 32 - 1;
dma_addr_t dma_addr;
+ u32 buf_len;
mdp->cur_rx = 0;
mdp->cur_tx = 0;
/* RX descriptor */
rxdesc = &mdp->rx_ring[i];
/* The size of the buffer is a multiple of 32 bytes. */
- rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 32);
- dma_addr = dma_map_single(&ndev->dev, skb->data,
- rxdesc->buffer_length,
+ buf_len = ALIGN(mdp->rx_buf_sz, 32);
+ rxdesc->len = cpu_to_edmac(mdp, buf_len << 16);
+ dma_addr = dma_map_single(&ndev->dev, skb->data, buf_len,
DMA_FROM_DEVICE);
if (dma_mapping_error(&ndev->dev, dma_addr)) {
kfree_skb(skb);
break;
}
mdp->rx_skbuff[i] = skb;
- rxdesc->addr = dma_addr;
+ rxdesc->addr = cpu_to_edmac(mdp, dma_addr);
rxdesc->status = cpu_to_edmac(mdp, RD_RACT | RD_RFP);
/* Rx descriptor address set */
mdp->tx_skbuff[i] = NULL;
txdesc = &mdp->tx_ring[i];
txdesc->status = cpu_to_edmac(mdp, TD_TFP);
- txdesc->buffer_length = 0;
+ txdesc->len = cpu_to_edmac(mdp, 0);
if (i == 0) {
/* Tx descriptor address set */
sh_eth_write(ndev, mdp->tx_desc_dma, TDLAR);
entry, edmac_to_cpu(mdp, txdesc->status));
/* Free the original skb. */
if (mdp->tx_skbuff[entry]) {
- dma_unmap_single(&ndev->dev, txdesc->addr,
- txdesc->buffer_length, DMA_TO_DEVICE);
+ dma_unmap_single(&ndev->dev,
+ edmac_to_cpu(mdp, txdesc->addr),
+ edmac_to_cpu(mdp, txdesc->len) >> 16,
+ DMA_TO_DEVICE);
dev_kfree_skb_irq(mdp->tx_skbuff[entry]);
mdp->tx_skbuff[entry] = NULL;
free_num++;
txdesc->status |= cpu_to_edmac(mdp, TD_TDLE);
ndev->stats.tx_packets++;
- ndev->stats.tx_bytes += txdesc->buffer_length;
+ ndev->stats.tx_bytes += edmac_to_cpu(mdp, txdesc->len) >> 16;
}
return free_num;
}
u32 desc_status;
int skbuff_size = mdp->rx_buf_sz + SH_ETH_RX_ALIGN + 32 - 1;
dma_addr_t dma_addr;
+ u32 buf_len;
boguscnt = min(boguscnt, *quota);
limit = boguscnt;
/* RACT bit must be checked before all the following reads */
dma_rmb();
desc_status = edmac_to_cpu(mdp, rxdesc->status);
- pkt_len = rxdesc->frame_length;
+ pkt_len = edmac_to_cpu(mdp, rxdesc->len) & RD_RFL;
if (--boguscnt < 0)
break;
if (mdp->cd->shift_rd0)
desc_status >>= 16;
+ skb = mdp->rx_skbuff[entry];
if (desc_status & (RD_RFS1 | RD_RFS2 | RD_RFS3 | RD_RFS4 |
RD_RFS5 | RD_RFS6 | RD_RFS10)) {
ndev->stats.rx_errors++;
ndev->stats.rx_missed_errors++;
if (desc_status & RD_RFS10)
ndev->stats.rx_over_errors++;
- } else {
+ } else if (skb) {
+ dma_addr = edmac_to_cpu(mdp, rxdesc->addr);
if (!mdp->cd->hw_swap)
sh_eth_soft_swap(
- phys_to_virt(ALIGN(rxdesc->addr, 4)),
+ phys_to_virt(ALIGN(dma_addr, 4)),
pkt_len + 2);
- skb = mdp->rx_skbuff[entry];
mdp->rx_skbuff[entry] = NULL;
if (mdp->cd->rpadir)
skb_reserve(skb, NET_IP_ALIGN);
- dma_unmap_single(&ndev->dev, rxdesc->addr,
+ dma_unmap_single(&ndev->dev, dma_addr,
ALIGN(mdp->rx_buf_sz, 32),
DMA_FROM_DEVICE);
skb_put(skb, pkt_len);
entry = mdp->dirty_rx % mdp->num_rx_ring;
rxdesc = &mdp->rx_ring[entry];
/* The size of the buffer is 32 byte boundary. */
- rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 32);
+ buf_len = ALIGN(mdp->rx_buf_sz, 32);
+ rxdesc->len = cpu_to_edmac(mdp, buf_len << 16);
if (mdp->rx_skbuff[entry] == NULL) {
skb = netdev_alloc_skb(ndev, skbuff_size);
break; /* Better luck next round. */
sh_eth_set_receive_align(skb);
dma_addr = dma_map_single(&ndev->dev, skb->data,
- rxdesc->buffer_length,
- DMA_FROM_DEVICE);
+ buf_len, DMA_FROM_DEVICE);
if (dma_mapping_error(&ndev->dev, dma_addr)) {
kfree_skb(skb);
break;
mdp->rx_skbuff[entry] = skb;
skb_checksum_none_assert(skb);
- rxdesc->addr = dma_addr;
+ rxdesc->addr = cpu_to_edmac(mdp, dma_addr);
}
dma_wmb(); /* RACT bit must be set after all the above writes */
if (entry >= mdp->num_rx_ring - 1)
/* Free all the skbuffs in the Rx queue. */
for (i = 0; i < mdp->num_rx_ring; i++) {
rxdesc = &mdp->rx_ring[i];
- rxdesc->status = 0;
- rxdesc->addr = 0xBADF00D0;
+ rxdesc->status = cpu_to_edmac(mdp, 0);
+ rxdesc->addr = cpu_to_edmac(mdp, 0xBADF00D0);
dev_kfree_skb(mdp->rx_skbuff[i]);
mdp->rx_skbuff[i] = NULL;
}
{
struct sh_eth_private *mdp = netdev_priv(ndev);
struct sh_eth_txdesc *txdesc;
+ dma_addr_t dma_addr;
u32 entry;
unsigned long flags;
txdesc = &mdp->tx_ring[entry];
/* soft swap. */
if (!mdp->cd->hw_swap)
- sh_eth_soft_swap(phys_to_virt(ALIGN(txdesc->addr, 4)),
- skb->len + 2);
- txdesc->addr = dma_map_single(&ndev->dev, skb->data, skb->len,
- DMA_TO_DEVICE);
- if (dma_mapping_error(&ndev->dev, txdesc->addr)) {
+ sh_eth_soft_swap(PTR_ALIGN(skb->data, 4), skb->len + 2);
+ dma_addr = dma_map_single(&ndev->dev, skb->data, skb->len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&ndev->dev, dma_addr)) {
kfree_skb(skb);
return NETDEV_TX_OK;
}
- txdesc->buffer_length = skb->len;
+ txdesc->addr = cpu_to_edmac(mdp, dma_addr);
+ txdesc->len = cpu_to_edmac(mdp, skb->len << 16);
dma_wmb(); /* TACT bit must be set after all the above writes */
if (entry >= mdp->num_tx_ring - 1)
DMAC_M_RINT1 = 0x00000001,
};
-/* Receive descriptor bit */
+/* Receive descriptor 0 bits */
enum RD_STS_BIT {
RD_RACT = 0x80000000, RD_RDLE = 0x40000000,
RD_RFP1 = 0x20000000, RD_RFP0 = 0x10000000,
#define RDFEND RD_RFP0
#define RD_RFP (RD_RFP1|RD_RFP0)
+/* Receive descriptor 1 bits */
+enum RD_LEN_BIT {
+ RD_RFL = 0x0000ffff, /* receive frame length */
+ RD_RBL = 0xffff0000, /* receive buffer length */
+};
+
/* FCFTR */
enum FCFTR_BIT {
FCFTR_RFF2 = 0x00040000, FCFTR_RFF1 = 0x00020000,
#define DEFAULT_FIFO_F_D_RFF (FCFTR_RFF2 | FCFTR_RFF1 | FCFTR_RFF0)
#define DEFAULT_FIFO_F_D_RFD (FCFTR_RFD2 | FCFTR_RFD1 | FCFTR_RFD0)
-/* Transmit descriptor bit */
+/* Transmit descriptor 0 bits */
enum TD_STS_BIT {
TD_TACT = 0x80000000, TD_TDLE = 0x40000000,
TD_TFP1 = 0x20000000, TD_TFP0 = 0x10000000,
#define TDFEND TD_TFP0
#define TD_TFP (TD_TFP1|TD_TFP0)
+/* Transmit descriptor 1 bits */
+enum TD_LEN_BIT {
+ TD_TBL = 0xffff0000, /* transmit buffer length */
+};
+
/* RMCR */
enum RMCR_BIT {
RMCR_RNC = 0x00000001,
*/
struct sh_eth_txdesc {
u32 status; /* TD0 */
-#if defined(__LITTLE_ENDIAN)
- u16 pad0; /* TD1 */
- u16 buffer_length; /* TD1 */
-#else
- u16 buffer_length; /* TD1 */
- u16 pad0; /* TD1 */
-#endif
+ u32 len; /* TD1 */
u32 addr; /* TD2 */
- u32 pad1; /* padding data */
+ u32 pad0; /* padding data */
} __aligned(2) __packed;
/* The sh ether Rx buffer descriptors.
*/
struct sh_eth_rxdesc {
u32 status; /* RD0 */
-#if defined(__LITTLE_ENDIAN)
- u16 frame_length; /* RD1 */
- u16 buffer_length; /* RD1 */
-#else
- u16 buffer_length; /* RD1 */
- u16 frame_length; /* RD1 */
-#endif
+ u32 len; /* RD1 */
u32 addr; /* RD2 */
u32 pad0; /* padding data */
} __aligned(2) __packed;
#endif
}
-#define SH_ETH_OFFSET_INVALID ((u16) ~0)
-
-static inline void sh_eth_write(struct net_device *ndev, u32 data,
- int enum_index)
-{
- struct sh_eth_private *mdp = netdev_priv(ndev);
- u16 offset = mdp->reg_offset[enum_index];
-
- if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
- return;
-
- iowrite32(data, mdp->addr + offset);
-}
-
-static inline u32 sh_eth_read(struct net_device *ndev, int enum_index)
-{
- struct sh_eth_private *mdp = netdev_priv(ndev);
- u16 offset = mdp->reg_offset[enum_index];
-
- if (WARN_ON(offset == SH_ETH_OFFSET_INVALID))
- return ~0U;
-
- return ioread32(mdp->addr + offset);
-}
-
static inline void *sh_eth_tsu_get_offset(struct sh_eth_private *mdp,
int enum_index)
{
new_spec.priority = EFX_FILTER_PRI_AUTO;
new_spec.flags = (EFX_FILTER_FLAG_RX |
- EFX_FILTER_FLAG_RX_RSS);
+ (efx_rss_enabled(efx) ?
+ EFX_FILTER_FLAG_RX_RSS : 0));
new_spec.dmaq_id = 0;
new_spec.rss_context = EFX_FILTER_RSS_CONTEXT_DEFAULT;
rc = efx_ef10_filter_push(efx, &new_spec,
{
struct efx_ef10_filter_table *table = efx->filter_state;
struct efx_ef10_dev_addr *addr_list;
+ enum efx_filter_flags filter_flags;
struct efx_filter_spec spec;
u8 baddr[ETH_ALEN];
unsigned int i, j;
addr_count = table->dev_uc_count;
}
+ filter_flags = efx_rss_enabled(efx) ? EFX_FILTER_FLAG_RX_RSS : 0;
+
/* Insert/renew filters */
for (i = 0; i < addr_count; i++) {
- efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO,
- EFX_FILTER_FLAG_RX_RSS,
- 0);
+ efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO, filter_flags, 0);
efx_filter_set_eth_local(&spec, EFX_FILTER_VID_UNSPEC,
addr_list[i].addr);
rc = efx_ef10_filter_insert(efx, &spec, true);
if (multicast && rollback) {
/* Also need an Ethernet broadcast filter */
- efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO,
- EFX_FILTER_FLAG_RX_RSS,
- 0);
+ efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO, filter_flags, 0);
eth_broadcast_addr(baddr);
efx_filter_set_eth_local(&spec, EFX_FILTER_VID_UNSPEC, baddr);
rc = efx_ef10_filter_insert(efx, &spec, true);
{
struct efx_ef10_filter_table *table = efx->filter_state;
struct efx_ef10_nic_data *nic_data = efx->nic_data;
+ enum efx_filter_flags filter_flags;
struct efx_filter_spec spec;
u8 baddr[ETH_ALEN];
int rc;
- efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO,
- EFX_FILTER_FLAG_RX_RSS,
- 0);
+ filter_flags = efx_rss_enabled(efx) ? EFX_FILTER_FLAG_RX_RSS : 0;
+
+ efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO, filter_flags, 0);
if (multicast)
efx_filter_set_mc_def(&spec);
if (!nic_data->workaround_26807) {
/* Also need an Ethernet broadcast filter */
efx_filter_init_rx(&spec, EFX_FILTER_PRI_AUTO,
- EFX_FILTER_FLAG_RX_RSS,
- 0);
+ filter_flags, 0);
eth_broadcast_addr(baddr);
efx_filter_set_eth_local(&spec, EFX_FILTER_VID_UNSPEC,
baddr);
#define EFX_TXQ_MAX_ENT(efx) (EFX_WORKAROUND_35388(efx) ? \
EFX_MAX_DMAQ_SIZE / 2 : EFX_MAX_DMAQ_SIZE)
+static inline bool efx_rss_enabled(struct efx_nic *efx)
+{
+ return efx->rss_spread > 1;
+}
+
/* Filters */
void efx_mac_reconfigure(struct efx_nic *efx);
*/
spec->priority = EFX_FILTER_PRI_AUTO;
spec->flags = (EFX_FILTER_FLAG_RX |
- (efx->n_rx_channels > 1 ? EFX_FILTER_FLAG_RX_RSS : 0) |
+ (efx_rss_enabled(efx) ? EFX_FILTER_FLAG_RX_RSS : 0) |
(efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0));
spec->dmaq_id = 0;
}
val |= (1 << TXC_GLCMD_LMTSWRST_LBN);
efx_mdio_write(efx, mmd, TXC_GLRGS_GLCMD, val);
- while (tries--) {
+ while (--tries) {
val = efx_mdio_read(efx, mmd, TXC_GLRGS_GLCMD);
if (!(val & (1 << TXC_GLCMD_LMTSWRST_LBN)))
break;
if (ret)
return ret;
- return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
+ ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
+ if (ret)
+ sun7i_gmac_exit(pdev, plat_dat->bsp_priv);
+
+ return ret;
}
static const struct of_device_id sun7i_dwmac_match[] = {
priv->hw->dma->stop_tx(priv->ioaddr);
priv->hw->dma->stop_rx(priv->ioaddr);
- stmmac_clear_descriptors(priv);
-
/* Enable Power down mode by programming the PMT regs */
if (device_may_wakeup(priv->device)) {
priv->hw->mac->pmt(priv->hw, priv->wolopts);
netif_device_attach(ndev);
- init_dma_desc_rings(ndev, GFP_ATOMIC);
+ priv->cur_rx = 0;
+ priv->dirty_rx = 0;
+ priv->dirty_tx = 0;
+ priv->cur_tx = 0;
+ stmmac_clear_descriptors(priv);
+
stmmac_hw_setup(ndev, false);
stmmac_init_tx_coalesce(priv);
stmmac_set_rx_mode(ndev);
for_each_child_of_node(node, slave_node) {
struct cpsw_slave_data *slave_data = data->slave_data + i;
const void *mac_addr = NULL;
- u32 phyid;
int lenp;
const __be32 *parp;
- struct device_node *mdio_node;
- struct platform_device *mdio;
/* This is no slave child node, continue */
if (strcmp(slave_node->name, "slave"))
continue;
priv->phy_node = of_parse_phandle(slave_node, "phy-handle", 0);
+ parp = of_get_property(slave_node, "phy_id", &lenp);
if (of_phy_is_fixed_link(slave_node)) {
- struct phy_device *pd;
+ struct device_node *phy_node;
+ struct phy_device *phy_dev;
+ /* In the case of a fixed PHY, the DT node associated
+ * to the PHY is the Ethernet MAC DT node.
+ */
ret = of_phy_register_fixed_link(slave_node);
if (ret)
return ret;
- pd = of_phy_find_device(slave_node);
- if (!pd)
+ phy_node = of_node_get(slave_node);
+ phy_dev = of_phy_find_device(phy_node);
+ if (!phy_dev)
return -ENODEV;
snprintf(slave_data->phy_id, sizeof(slave_data->phy_id),
- PHY_ID_FMT, pd->bus->id, pd->phy_id);
- goto no_phy_slave;
- }
- parp = of_get_property(slave_node, "phy_id", &lenp);
- if ((parp == NULL) || (lenp != (sizeof(void *) * 2))) {
- dev_err(&pdev->dev, "Missing slave[%d] phy_id property\n", i);
+ PHY_ID_FMT, phy_dev->bus->id, phy_dev->addr);
+ } else if (parp) {
+ u32 phyid;
+ struct device_node *mdio_node;
+ struct platform_device *mdio;
+
+ if (lenp != (sizeof(__be32) * 2)) {
+ dev_err(&pdev->dev, "Invalid slave[%d] phy_id property\n", i);
+ goto no_phy_slave;
+ }
+ mdio_node = of_find_node_by_phandle(be32_to_cpup(parp));
+ phyid = be32_to_cpup(parp+1);
+ mdio = of_find_device_by_node(mdio_node);
+ of_node_put(mdio_node);
+ if (!mdio) {
+ dev_err(&pdev->dev, "Missing mdio platform device\n");
+ return -EINVAL;
+ }
+ snprintf(slave_data->phy_id, sizeof(slave_data->phy_id),
+ PHY_ID_FMT, mdio->name, phyid);
+ } else {
+ dev_err(&pdev->dev, "No slave[%d] phy_id or fixed-link property\n", i);
goto no_phy_slave;
}
- mdio_node = of_find_node_by_phandle(be32_to_cpup(parp));
- phyid = be32_to_cpup(parp+1);
- mdio = of_find_device_by_node(mdio_node);
- of_node_put(mdio_node);
- if (!mdio) {
- dev_err(&pdev->dev, "Missing mdio platform device\n");
- return -EINVAL;
- }
- snprintf(slave_data->phy_id, sizeof(slave_data->phy_id),
- PHY_ID_FMT, mdio->name, phyid);
slave_data->phy_if = of_get_phy_mode(slave_node);
if (slave_data->phy_if < 0) {
dev_err(&pdev->dev, "Missing or malformed slave[%d] phy-mode property\n",
ndev->irq = platform_get_irq(pdev, 1);
if (ndev->irq < 0) {
dev_err(priv->dev, "error getting irq resource\n");
- ret = -ENOENT;
+ ret = ndev->irq;
goto clean_ale_ret;
}
/* RX IRQ */
irq = platform_get_irq(pdev, 1);
- if (irq < 0)
+ if (irq < 0) {
+ ret = irq;
goto clean_ale_ret;
+ }
priv->irqs_table[0] = irq;
ret = devm_request_irq(&pdev->dev, irq, cpsw_rx_interrupt,
/* TX IRQ */
irq = platform_get_irq(pdev, 2);
- if (irq < 0)
+ if (irq < 0) {
+ ret = irq;
goto clean_ale_ret;
+ }
priv->irqs_table[1] = irq;
ret = devm_request_irq(&pdev->dev, irq, cpsw_tx_interrupt,
err = udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
&fl6.saddr, &fl6.daddr, prio, ttl,
sport, geneve->dst_port, !udp_csum);
-
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return NETDEV_TX_OK;
tx_error:
struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_dev *t, *geneve = netdev_priv(dev);
bool tun_collect_md, tun_on_same_port;
- int err;
+ int err, encap_len;
if (!remote)
return -EINVAL;
if (t)
return -EBUSY;
+ /* make enough headroom for basic scenario */
+ encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
+ if (remote->sa.sa_family == AF_INET)
+ encap_len += sizeof(struct iphdr);
+ else
+ encap_len += sizeof(struct ipv6hdr);
+ dev->needed_headroom = encap_len + ETH_HLEN;
+
if (metadata) {
if (tun_on_same_port)
return -EPERM;
if (!atomic_dec_and_test(&sp->refcnt))
down(&sp->dead_sem);
- unregister_netdev(sp->dev);
+ /* We must stop the queue to avoid potentially scribbling
+ * on the free buffers. The sp->dead_sem is not sufficient
+ * to protect us from sp->xbuff access.
+ */
+ netif_stop_queue(sp->dev);
- del_timer(&sp->tx_t);
- del_timer(&sp->resync_t);
+ del_timer_sync(&sp->tx_t);
+ del_timer_sync(&sp->resync_t);
/* Free all 6pack frame buffers. */
kfree(sp->rbuff);
kfree(sp->xbuff);
+
+ unregister_netdev(sp->dev);
}
/* Perform I/O control on an active 6pack channel. */
*/
if (!atomic_dec_and_test(&ax->refcnt))
down(&ax->dead_sem);
-
- unregister_netdev(ax->dev);
+ /*
+ * Halt the transmit queue so that a new transmit cannot scribble
+ * on our buffers
+ */
+ netif_stop_queue(ax->dev);
/* Free all AX25 frame buffers. */
kfree(ax->rbuff);
kfree(ax->xbuff);
ax->tty = NULL;
+
+ unregister_netdev(ax->dev);
}
/* Perform I/O control on an active ax25 channel. */
}
cb->bus_number = v;
cb->parent = pb;
+
cb->mii_bus = mdiobus_alloc();
+ if (!cb->mii_bus) {
+ ret_val = -ENOMEM;
+ of_node_put(child_bus_node);
+ break;
+ }
cb->mii_bus->priv = cb;
-
cb->mii_bus->irq = cb->phy_irq;
cb->mii_bus->name = "mdio_mux";
snprintf(cb->mii_bus->id, MII_BUS_ID_SIZE, "%x.%x",
{
const struct device *dev = &phydev->dev;
const struct device_node *of_node = dev->of_node;
+ const struct device *dev_walker;
- if (!of_node && dev->parent->of_node)
- of_node = dev->parent->of_node;
+ /* The Micrel driver has a deprecated option to place phy OF
+ * properties in the MAC node. Walk up the tree of devices to
+ * find a device with an OF node.
+ */
+ dev_walker = &phydev->dev;
+ do {
+ of_node = dev_walker->of_node;
+ dev_walker = dev_walker->parent;
+
+ } while (!of_node && dev_walker);
if (of_node) {
ksz9021_load_values_from_of(phydev, of_node,
sk->sk_family = PF_PPPOX;
sk->sk_protocol = PX_PROTO_OE;
+ INIT_WORK(&pppox_sk(sk)->proto.pppoe.padt_work,
+ pppoe_unbind_sock_work);
+
return 0;
}
lock_sock(sk);
- INIT_WORK(&po->proto.pppoe.padt_work, pppoe_unbind_sock_work);
-
error = -EINVAL;
if (sp->sa_protocol != PX_PROTO_OE)
goto end;
po->pppoe_dev = NULL;
}
- memset(sk_pppox(po) + 1, 0,
- sizeof(struct pppox_sock) - sizeof(struct sock));
+ po->pppoe_ifindex = 0;
+ memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa));
+ memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay));
+ memset(&po->chan, 0, sizeof(po->chan));
+ po->next = NULL;
+ po->num = 0;
+
sk->sk_state = PPPOX_NONE;
}
struct pptp_opt *opt = &po->proto.pptp;
int error = 0;
+ if (sockaddr_len < sizeof(struct sockaddr_pppox))
+ return -EINVAL;
+
lock_sock(sk);
opt->src_addr = sp->sa_addr.pptp;
struct flowi4 fl4;
int error = 0;
+ if (sockaddr_len < sizeof(struct sockaddr_pppox))
+ return -EINVAL;
+
if (sp->sa_protocol != PX_PROTO_PPTP)
return -EINVAL;
.ndo_stop = usbnet_stop,
.ndo_start_xmit = usbnet_start_xmit,
.ndo_tx_timeout = usbnet_tx_timeout,
- .ndo_change_mtu = usbnet_change_mtu,
+ .ndo_change_mtu = cdc_ncm_change_mtu,
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_vlan_rx_add_vid = cdc_mbim_rx_add_vid,
if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting))
goto err;
- ret = cdc_ncm_bind_common(dev, intf, data_altsetting, 0);
+ ret = cdc_ncm_bind_common(dev, intf, data_altsetting, dev->driver_info->data);
if (ret)
goto err;
.tx_fixup = cdc_mbim_tx_fixup,
};
+/* The specification explicitly allows NDPs to be placed anywhere in the
+ * frame, but some devices fail unless the NDP is placed after the IP
+ * packets. Use the CDC_NCM_FLAG_NDP_TO_END flag to force this
+ * behaviour.
+ *
+ * Note: The current implementation of this feature restricts each NTB
+ * to a single NDP, implying that multiplexed sessions cannot share an
+ * NTB. This might affect performance for multiplexed sessions.
+ */
+static const struct driver_info cdc_mbim_info_ndp_to_end = {
+ .description = "CDC MBIM",
+ .flags = FLAG_NO_SETINT | FLAG_MULTI_PACKET | FLAG_WWAN,
+ .bind = cdc_mbim_bind,
+ .unbind = cdc_mbim_unbind,
+ .manage_power = cdc_mbim_manage_power,
+ .rx_fixup = cdc_mbim_rx_fixup,
+ .tx_fixup = cdc_mbim_tx_fixup,
+ .data = CDC_NCM_FLAG_NDP_TO_END,
+};
+
static const struct usb_device_id mbim_devs[] = {
/* This duplicate NCM entry is intentional. MBIM devices can
* be disguised as NCM by default, and this is necessary to
{ USB_VENDOR_AND_INTERFACE_INFO(0x0bdb, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE),
.driver_info = (unsigned long)&cdc_mbim_info,
},
+ /* Huawei E3372 fails unless NDP comes after the IP packets */
+ { USB_DEVICE_AND_INTERFACE_INFO(0x12d1, 0x157d, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE),
+ .driver_info = (unsigned long)&cdc_mbim_info_ndp_to_end,
+ },
/* default entry */
{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE),
.driver_info = (unsigned long)&cdc_mbim_info_zlp,
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/ctype.h>
+#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/workqueue.h>
#include <linux/mii.h>
kfree(ctx);
}
+/* we need to override the usbnet change_mtu ndo for two reasons:
+ * - respect the negotiated maximum datagram size
+ * - avoid unwanted changes to rx and tx buffers
+ */
+int cdc_ncm_change_mtu(struct net_device *net, int new_mtu)
+{
+ struct usbnet *dev = netdev_priv(net);
+ struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
+ int maxmtu = ctx->max_datagram_size - cdc_ncm_eth_hlen(dev);
+
+ if (new_mtu <= 0 || new_mtu > maxmtu)
+ return -EINVAL;
+ net->mtu = new_mtu;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cdc_ncm_change_mtu);
+
+static const struct net_device_ops cdc_ncm_netdev_ops = {
+ .ndo_open = usbnet_open,
+ .ndo_stop = usbnet_stop,
+ .ndo_start_xmit = usbnet_start_xmit,
+ .ndo_tx_timeout = usbnet_tx_timeout,
+ .ndo_change_mtu = cdc_ncm_change_mtu,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags)
{
struct cdc_ncm_ctx *ctx;
/* add our sysfs attrs */
dev->net->sysfs_groups[0] = &cdc_ncm_sysfs_attr_group;
+ /* must handle MTU changes */
+ dev->net->netdev_ops = &cdc_ncm_netdev_ops;
+
return 0;
error2:
* NTH16 header as we would normally do. NDP isn't written to the SKB yet, and
* the wNdpIndex field in the header is actually not consistent with reality. It will be later.
*/
- if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
if (ctx->delayed_ndp16->dwSignature == sign)
return ctx->delayed_ndp16;
+ /* We can only push a single NDP to the end. Return
+ * NULL to send what we've already got and queue this
+ * skb for later.
+ */
+ else if (ctx->delayed_ndp16->dwSignature)
+ return NULL;
+ }
+
/* follow the chain of NDPs, looking for a match */
while (ndpoffset) {
ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb->data + ndpoffset);
.driver_info = (unsigned long) &wwan_info,
},
+ /* DW5812 LTE Verizon Mobile Broadband Card
+ * Unlike DW5550 this device requires FLAG_NOARP
+ */
+ { USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x81bb,
+ USB_CLASS_COMM,
+ USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
+ .driver_info = (unsigned long)&wwan_noarp_info,
+ },
+
+ /* DW5813 LTE AT&T Mobile Broadband Card
+ * Unlike DW5550 this device requires FLAG_NOARP
+ */
+ { USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x81bc,
+ USB_CLASS_COMM,
+ USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE),
+ .driver_info = (unsigned long)&wwan_noarp_info,
+ },
+
/* Dell branded MBM devices like DW5550 */
{ .match_flags = USB_DEVICE_ID_MATCH_INT_INFO
| USB_DEVICE_ID_MATCH_VENDOR,
{QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */
{QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */
{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
+ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */
/* 4. Gobi 1000 devices */
{QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */
mutex_lock(&tp->control);
- /* The WORK_ENABLE may be set when autoresume occurs */
- if (test_bit(WORK_ENABLE, &tp->flags)) {
- clear_bit(WORK_ENABLE, &tp->flags);
- usb_kill_urb(tp->intr_urb);
- cancel_delayed_work_sync(&tp->schedule);
-
- /* disable the tx/rx, if the workqueue has enabled them. */
- if (netif_carrier_ok(netdev))
- tp->rtl_ops.disable(tp);
- }
-
tp->rtl_ops.up(tp);
rtl8152_set_speed(tp, AUTONEG_ENABLE,
} else {
mutex_lock(&tp->control);
- /* The autosuspend may have been enabled and wouldn't
- * be disable when autoresume occurs, because the
- * netif_running() would be false.
- */
- rtl_runtime_suspend_enable(tp, false);
-
tp->rtl_ops.down(tp);
mutex_unlock(&tp->control);
netif_device_attach(tp->netdev);
}
- if (netif_running(tp->netdev)) {
+ if (netif_running(tp->netdev) && tp->netdev->flags & IFF_UP) {
if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) {
rtl_runtime_suspend_enable(tp, false);
clear_bit(SELECTIVE_SUSPEND, &tp->flags);
}
usb_submit_urb(tp->intr_urb, GFP_KERNEL);
} else if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) {
+ if (tp->netdev->flags & IFF_UP)
+ rtl_runtime_suspend_enable(tp, false);
clear_bit(SELECTIVE_SUSPEND, &tp->flags);
}
return 0;
}
+static int rtl8152_reset_resume(struct usb_interface *intf)
+{
+ struct r8152 *tp = usb_get_intfdata(intf);
+
+ clear_bit(SELECTIVE_SUSPEND, &tp->flags);
+ return rtl8152_resume(intf);
+}
+
static void rtl8152_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
struct r8152 *tp = netdev_priv(dev);
.disconnect = rtl8152_disconnect,
.suspend = rtl8152_suspend,
.resume = rtl8152_resume,
- .reset_resume = rtl8152_resume,
+ .reset_resume = rtl8152_reset_resume,
.pre_reset = rtl8152_pre_reset,
.post_reset = rtl8152_post_reset,
.supports_autosuspend = 1,
kfree_skb(skb);
goto drop;
}
- /* don't change ip_summed == CHECKSUM_PARTIAL, as that
- * will cause bad checksum on forwarded packets
- */
- if (skb->ip_summed == CHECKSUM_NONE &&
- rcv->features & NETIF_F_RXCSUM)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
skip_page_frags = true;
goto rcd_done;
}
- new_dma_addr = dma_map_page(&adapter->pdev->dev
- , rbi->page,
- 0, PAGE_SIZE,
- PCI_DMA_FROMDEVICE);
+ new_dma_addr = dma_map_page(&adapter->pdev->dev,
+ new_page,
+ 0, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
if (dma_mapping_error(&adapter->pdev->dev,
new_dma_addr)) {
put_page(new_page);
/*
* Version numbers
*/
-#define VMXNET3_DRIVER_VERSION_STRING "1.4.4.0-k"
+#define VMXNET3_DRIVER_VERSION_STRING "1.4.5.0-k"
/* a 32-bit int, each byte encode a verion number in VMXNET3_DRIVER_VERSION */
-#define VMXNET3_DRIVER_VERSION_NUM 0x01040400
+#define VMXNET3_DRIVER_VERSION_NUM 0x01040500
#if defined(CONFIG_PCI_MSI)
/* RSS only makes sense if MSI-X is supported. */
}
/* called under rcu_read_lock */
-static void vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
+static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
{
struct fib_result res = { .tclassid = 0 };
struct net *net = dev_net(dev);
u8 flags = fl4->flowi4_flags;
u8 scope = fl4->flowi4_scope;
u8 tos = RT_FL_TOS(fl4);
+ int rc;
if (unlikely(!fl4->daddr))
- return;
+ return 0;
fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
- if (!fib_lookup(net, fl4, &res, 0)) {
+ rc = fib_lookup(net, fl4, &res, 0);
+ if (!rc) {
if (res.type == RTN_LOCAL)
fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr;
else
fl4->flowi4_flags = flags;
fl4->flowi4_tos = orig_tos;
fl4->flowi4_scope = scope;
+
+ return rc;
}
#if IS_ENABLED(CONFIG_IPV6)
struct pcpu_sw_netstats *stats;
union vxlan_addr saddr;
int err = 0;
- union vxlan_addr *remote_ip;
/* For flow based devices, map all packets to VNI 0 */
if (vs->flags & VXLAN_F_COLLECT_METADATA)
if (!vxlan)
goto drop;
- remote_ip = &vxlan->default_dst.remote_ip;
skb_reset_mac_header(skb);
skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
skb->protocol = eth_type_trans(skb, vxlan->dev);
if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
goto drop;
- /* Re-examine inner Ethernet packet */
- if (remote_ip->sa.sa_family == AF_INET) {
+ /* Get data from the outer IP header */
+ if (vxlan_get_sk_family(vs) == AF_INET) {
oip = ip_hdr(skb);
saddr.sin.sin_addr.s_addr = oip->saddr;
saddr.sa.sa_family = AF_INET;
!(vxflags & VXLAN_F_UDP_CSUM));
}
+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
+ struct sk_buff *skb, int oif,
+ const struct in6_addr *daddr,
+ struct in6_addr *saddr)
+{
+ struct dst_entry *ndst;
+ struct flowi6 fl6;
+ int err;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = oif;
+ fl6.daddr = *daddr;
+ fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = IPPROTO_UDP;
+
+ err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
+ vxlan->vn6_sock->sock->sk,
+ &ndst, &fl6);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ *saddr = fl6.saddr;
+ return ndst;
+}
+#endif
+
/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
struct vxlan_dev *dst_vxlan)
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct dst_entry *ndst;
- struct flowi6 fl6;
+ struct in6_addr saddr;
u32 rt6i_flags;
if (!vxlan->vn6_sock)
goto drop;
sk = vxlan->vn6_sock->sock->sk;
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
- fl6.daddr = dst->sin6.sin6_addr;
- fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
- fl6.flowi6_mark = skb->mark;
- fl6.flowi6_proto = IPPROTO_UDP;
-
- if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) {
+ ndst = vxlan6_get_route(vxlan, skb,
+ rdst ? rdst->remote_ifindex : 0,
+ &dst->sin6.sin6_addr, &saddr);
+ if (IS_ERR(ndst)) {
netdev_dbg(dev, "no route to %pI6\n",
&dst->sin6.sin6_addr);
dev->stats.tx_carrier_errors++;
}
ttl = ttl ? : ip6_dst_hoplimit(ndst);
- err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
+ err = vxlan6_xmit_skb(ndst, sk, skb, dev, &saddr, &dst->sin6.sin6_addr,
0, ttl, src_port, dst_port, htonl(vni << 8), md,
!net_eq(vxlan->net, dev_net(vxlan->dev)),
flags);
vxlan->cfg.port_max, true);
dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
- if (ip_tunnel_info_af(info) == AF_INET)
+ if (ip_tunnel_info_af(info) == AF_INET) {
+ if (!vxlan->vn4_sock)
+ return -EINVAL;
return egress_ipv4_tun_info(dev, skb, info, sport, dport);
- return -EINVAL;
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ struct dst_entry *ndst;
+
+ if (!vxlan->vn6_sock)
+ return -EINVAL;
+ ndst = vxlan6_get_route(vxlan, skb, 0,
+ &info->key.u.ipv6.dst,
+ &info->key.u.ipv6.src);
+ if (IS_ERR(ndst))
+ return PTR_ERR(ndst);
+ dst_release(ndst);
+
+ info->key.tp_src = sport;
+ info->key.tp_dst = dport;
+#else /* !CONFIG_IPV6 */
+ return -EPFNOSUPPORT;
+#endif
+ }
+ return 0;
}
static const struct net_device_ops vxlan_netdev_ops = {
#include "iwl-agn-hw.h"
/* Highest firmware API version supported */
-#define IWL7260_UCODE_API_MAX 19
+#define IWL7260_UCODE_API_MAX 17
+#define IWL7265_UCODE_API_MAX 19
+#define IWL7265D_UCODE_API_MAX 19
/* Oldest version we won't warn about */
#define IWL7260_UCODE_API_OK 13
+#define IWL7265_UCODE_API_OK 13
+#define IWL7265D_UCODE_API_OK 13
/* Lowest firmware API version supported */
#define IWL7260_UCODE_API_MIN 13
+#define IWL7265_UCODE_API_MIN 13
+#define IWL7265D_UCODE_API_MIN 13
/* NVM versions */
#define IWL7260_NVM_VERSION 0x0a1d
.ht40_bands = BIT(IEEE80211_BAND_2GHZ) | BIT(IEEE80211_BAND_5GHZ),
};
-#define IWL_DEVICE_7000 \
- .ucode_api_max = IWL7260_UCODE_API_MAX, \
- .ucode_api_ok = IWL7260_UCODE_API_OK, \
- .ucode_api_min = IWL7260_UCODE_API_MIN, \
+#define IWL_DEVICE_7000_COMMON \
.device_family = IWL_DEVICE_FAMILY_7000, \
.max_inst_size = IWL60_RTC_INST_SIZE, \
.max_data_size = IWL60_RTC_DATA_SIZE, \
.max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K, \
.dccm_offset = IWL7000_DCCM_OFFSET
+#define IWL_DEVICE_7000 \
+ IWL_DEVICE_7000_COMMON, \
+ .ucode_api_max = IWL7260_UCODE_API_MAX, \
+ .ucode_api_ok = IWL7260_UCODE_API_OK, \
+ .ucode_api_min = IWL7260_UCODE_API_MIN
+
+#define IWL_DEVICE_7005 \
+ IWL_DEVICE_7000_COMMON, \
+ .ucode_api_max = IWL7265_UCODE_API_MAX, \
+ .ucode_api_ok = IWL7265_UCODE_API_OK, \
+ .ucode_api_min = IWL7265_UCODE_API_MIN
+
+#define IWL_DEVICE_7005D \
+ IWL_DEVICE_7000_COMMON, \
+ .ucode_api_max = IWL7265D_UCODE_API_MAX, \
+ .ucode_api_ok = IWL7265D_UCODE_API_OK, \
+ .ucode_api_min = IWL7265D_UCODE_API_MIN
+
const struct iwl_cfg iwl7260_2ac_cfg = {
.name = "Intel(R) Dual Band Wireless AC 7260",
.fw_name_pre = IWL7260_FW_PRE,
const struct iwl_cfg iwl3165_2ac_cfg = {
.name = "Intel(R) Dual Band Wireless AC 3165",
.fw_name_pre = IWL7265D_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005D,
.ht_params = &iwl7000_ht_params,
.nvm_ver = IWL3165_NVM_VERSION,
.nvm_calib_ver = IWL3165_TX_POWER_VERSION,
const struct iwl_cfg iwl7265_2ac_cfg = {
.name = "Intel(R) Dual Band Wireless AC 7265",
.fw_name_pre = IWL7265_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005,
.ht_params = &iwl7265_ht_params,
.nvm_ver = IWL7265_NVM_VERSION,
.nvm_calib_ver = IWL7265_TX_POWER_VERSION,
const struct iwl_cfg iwl7265_2n_cfg = {
.name = "Intel(R) Dual Band Wireless N 7265",
.fw_name_pre = IWL7265_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005,
.ht_params = &iwl7265_ht_params,
.nvm_ver = IWL7265_NVM_VERSION,
.nvm_calib_ver = IWL7265_TX_POWER_VERSION,
const struct iwl_cfg iwl7265_n_cfg = {
.name = "Intel(R) Wireless N 7265",
.fw_name_pre = IWL7265_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005,
.ht_params = &iwl7265_ht_params,
.nvm_ver = IWL7265_NVM_VERSION,
.nvm_calib_ver = IWL7265_TX_POWER_VERSION,
const struct iwl_cfg iwl7265d_2ac_cfg = {
.name = "Intel(R) Dual Band Wireless AC 7265",
.fw_name_pre = IWL7265D_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005D,
.ht_params = &iwl7265_ht_params,
.nvm_ver = IWL7265D_NVM_VERSION,
.nvm_calib_ver = IWL7265_TX_POWER_VERSION,
const struct iwl_cfg iwl7265d_2n_cfg = {
.name = "Intel(R) Dual Band Wireless N 7265",
.fw_name_pre = IWL7265D_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005D,
.ht_params = &iwl7265_ht_params,
.nvm_ver = IWL7265D_NVM_VERSION,
.nvm_calib_ver = IWL7265_TX_POWER_VERSION,
const struct iwl_cfg iwl7265d_n_cfg = {
.name = "Intel(R) Wireless N 7265",
.fw_name_pre = IWL7265D_FW_PRE,
- IWL_DEVICE_7000,
+ IWL_DEVICE_7005D,
.ht_params = &iwl7265_ht_params,
.nvm_ver = IWL7265D_NVM_VERSION,
.nvm_calib_ver = IWL7265_TX_POWER_VERSION,
MODULE_FIRMWARE(IWL7260_MODULE_FIRMWARE(IWL7260_UCODE_API_OK));
MODULE_FIRMWARE(IWL3160_MODULE_FIRMWARE(IWL7260_UCODE_API_OK));
-MODULE_FIRMWARE(IWL7265_MODULE_FIRMWARE(IWL7260_UCODE_API_OK));
-MODULE_FIRMWARE(IWL7265D_MODULE_FIRMWARE(IWL7260_UCODE_API_OK));
+MODULE_FIRMWARE(IWL7265_MODULE_FIRMWARE(IWL7265_UCODE_API_OK));
+MODULE_FIRMWARE(IWL7265D_MODULE_FIRMWARE(IWL7265D_UCODE_API_OK));
mvmvif->ap_sta_id != IWL_MVM_STATION_COUNT) {
u8 sta_id = mvmvif->ap_sta_id;
- sta = rcu_dereference_protected(mvm->fw_id_to_mac_id[sta_id],
- lockdep_is_held(&mvm->mutex));
+ sta = rcu_dereference_check(mvm->fw_id_to_mac_id[sta_id],
+ lockdep_is_held(&mvm->mutex));
/*
* It is possible that the 'sta' parameter is NULL,
* for example when a GTK is removed - the sta_id will then
u16 *phase1key)
{
struct iwl_mvm_sta *mvm_sta;
- u8 sta_id = iwl_mvm_get_key_sta_id(mvm, vif, sta);
+ u8 sta_id;
bool mcast = !(keyconf->flags & IEEE80211_KEY_FLAG_PAIRWISE);
- if (WARN_ON_ONCE(sta_id == IWL_MVM_STATION_COUNT))
- return;
-
rcu_read_lock();
+ sta_id = iwl_mvm_get_key_sta_id(mvm, vif, sta);
+ if (WARN_ON_ONCE(sta_id == IWL_MVM_STATION_COUNT))
+ goto unlock;
+
if (!sta) {
sta = rcu_dereference(mvm->fw_id_to_mac_id[sta_id]);
if (WARN_ON(IS_ERR_OR_NULL(sta))) {
mvm_sta = iwl_mvm_sta_from_mac80211(sta);
iwl_mvm_send_sta_key(mvm, mvm_sta, keyconf, mcast,
iv32, phase1key, CMD_ASYNC, keyconf->hw_key_idx);
+
+ unlock:
rcu_read_unlock();
}
struct netrx_pending_operations *npo)
{
struct xenvif_rx_meta *meta;
- struct xen_netif_rx_request *req;
+ struct xen_netif_rx_request req;
- req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
+ RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
meta = npo->meta + npo->meta_prod++;
meta->gso_type = XEN_NETIF_GSO_TYPE_NONE;
meta->gso_size = 0;
meta->size = 0;
- meta->id = req->id;
+ meta->id = req.id;
npo->copy_off = 0;
- npo->copy_gref = req->gref;
+ npo->copy_gref = req.gref;
return meta;
}
struct xenvif *vif = netdev_priv(skb->dev);
int nr_frags = skb_shinfo(skb)->nr_frags;
int i;
- struct xen_netif_rx_request *req;
+ struct xen_netif_rx_request req;
struct xenvif_rx_meta *meta;
unsigned char *data;
int head = 1;
/* Set up a GSO prefix descriptor, if necessary */
if ((1 << gso_type) & vif->gso_prefix_mask) {
- req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
+ RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
meta = npo->meta + npo->meta_prod++;
meta->gso_type = gso_type;
meta->gso_size = skb_shinfo(skb)->gso_size;
meta->size = 0;
- meta->id = req->id;
+ meta->id = req.id;
}
- req = RING_GET_REQUEST(&queue->rx, queue->rx.req_cons++);
+ RING_COPY_REQUEST(&queue->rx, queue->rx.req_cons++, &req);
meta = npo->meta + npo->meta_prod++;
if ((1 << gso_type) & vif->gso_mask) {
}
meta->size = 0;
- meta->id = req->id;
+ meta->id = req.id;
npo->copy_off = 0;
- npo->copy_gref = req->gref;
+ npo->copy_gref = req.gref;
data = skb->data;
while (data < skb_tail_pointer(skb)) {
* Allow a burst big enough to transmit a jumbo packet of up to 128kB.
* Otherwise the interface can seize up due to insufficient credit.
*/
- max_burst = RING_GET_REQUEST(&queue->tx, queue->tx.req_cons)->size;
- max_burst = min(max_burst, 131072UL);
- max_burst = max(max_burst, queue->credit_bytes);
+ max_burst = max(131072UL, queue->credit_bytes);
/* Take care that adding a new chunk of credit doesn't wrap to zero. */
max_credit = queue->remaining_credit + queue->credit_bytes;
spin_unlock_irqrestore(&queue->response_lock, flags);
if (cons == end)
break;
- txp = RING_GET_REQUEST(&queue->tx, cons++);
+ RING_COPY_REQUEST(&queue->tx, cons++, txp);
} while (1);
queue->tx.req_cons = cons;
}
if (drop_err)
txp = &dropped_tx;
- memcpy(txp, RING_GET_REQUEST(&queue->tx, cons + slots),
- sizeof(*txp));
+ RING_COPY_REQUEST(&queue->tx, cons + slots, txp);
/* If the guest submitted a frame >= 64 KiB then
* first->size overflowed and following slots will
return -EBADR;
}
- memcpy(&extra, RING_GET_REQUEST(&queue->tx, cons),
- sizeof(extra));
+ RING_COPY_REQUEST(&queue->tx, cons, &extra);
if (unlikely(!extra.type ||
extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
queue->tx.req_cons = ++cons;
idx = queue->tx.req_cons;
rmb(); /* Ensure that we see the request before we copy it. */
- memcpy(&txreq, RING_GET_REQUEST(&queue->tx, idx), sizeof(txreq));
+ RING_COPY_REQUEST(&queue->tx, idx, &txreq);
/* Credit-based scheduling. */
if (txreq.size > queue->remaining_credit &&
{
bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue);
- if (kill)
+ if (kill) {
blk_set_queue_dying(ns->queue);
+
+ /*
+ * The controller was shutdown first if we got here through
+ * device removal. The shutdown may requeue outstanding
+ * requests. These need to be aborted immediately so
+ * del_gendisk doesn't block indefinitely for their completion.
+ */
+ blk_mq_abort_requeue_list(ns->queue);
+ }
if (ns->disk->flags & GENHD_FL_UP)
del_gendisk(ns->disk);
if (kill || !blk_queue_dying(ns->queue)) {
{
struct nvme_ns *ns, *next;
+ if (nvme_io_incapable(dev)) {
+ /*
+ * If the device is not capable of IO (surprise hot-removal,
+ * for example), we need to quiesce prior to deleting the
+ * namespaces. This will end outstanding requests and prevent
+ * attempts to sync dirty data.
+ */
+ nvme_dev_shutdown(dev);
+ }
list_for_each_entry_safe(ns, next, &dev->namespaces, list)
nvme_ns_remove(ns);
}
bool "TI DRA7xx PCIe controller"
select PCIE_DW
depends on OF && HAS_IOMEM && TI_PIPE3
+ depends on BROKEN
help
Enables support for the PCIe controller in the DRA7xx SoC. There
are two instances of PCIe controller in DRA7xx. This controller can
*val = *(u8 __force *) walker;
else if (size == 2)
*val = *(u16 __force *) walker;
- else if (size != 4)
+ else if (size == 4)
+ *val = reg_val;
+ else
return PCIBIOS_BAD_REGISTER_NUMBER;
return PCIBIOS_SUCCESSFUL;
tristate "Allwinner sun9i SoC USB PHY driver"
depends on ARCH_SUNXI && HAS_IOMEM && OF
depends on RESET_CONTROLLER
+ depends on USB_COMMON
select GENERIC_PHY
help
Enable this to support the transceiver that is part of Allwinner
struct phy_provider *provider;
struct resource *res;
unsigned cnt = 0;
+ int ret;
if (of_get_child_count(node) == 0) {
dev_err(dev, "PHY no child node\n");
if (of_property_read_u32(child, "reg", &id)) {
dev_err(dev, "missing reg property for %s\n",
child->name);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
if (id >= MAX_NUM_PHYS) {
dev_err(dev, "invalid PHY id: %u\n", id);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
if (core->phys[id].phy) {
dev_err(dev, "duplicated PHY id: %u\n", id);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
p = &core->phys[id];
p->phy = devm_phy_create(dev, child, &cygnus_pcie_phy_ops);
if (IS_ERR(p->phy)) {
dev_err(dev, "failed to create PHY\n");
- return PTR_ERR(p->phy);
+ ret = PTR_ERR(p->phy);
+ goto put_child;
}
p->core = core;
dev_dbg(dev, "registered %u PCIe PHY(s)\n", cnt);
return 0;
+put_child:
+ of_node_put(child);
+ return ret;
}
static const struct of_device_id cygnus_pcie_phy_match_table[] = {
struct phy_provider *phy_provider;
struct phy_berlin_priv *priv;
struct resource *res;
- int i = 0;
+ int ret, i = 0;
u32 phy_id;
priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
if (of_property_read_u32(child, "reg", &phy_id)) {
dev_err(dev, "missing reg property in node %s\n",
child->name);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
if (phy_id >= ARRAY_SIZE(phy_berlin_power_down_bits)) {
dev_err(dev, "invalid reg in node %s\n", child->name);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
phy_desc = devm_kzalloc(dev, sizeof(*phy_desc), GFP_KERNEL);
- if (!phy_desc)
- return -ENOMEM;
+ if (!phy_desc) {
+ ret = -ENOMEM;
+ goto put_child;
+ }
phy = devm_phy_create(dev, NULL, &phy_berlin_sata_ops);
if (IS_ERR(phy)) {
dev_err(dev, "failed to create PHY %d\n", phy_id);
- return PTR_ERR(phy);
+ ret = PTR_ERR(phy);
+ goto put_child;
}
phy_desc->phy = phy;
phy_provider =
devm_of_phy_provider_register(dev, phy_berlin_sata_phy_xlate);
return PTR_ERR_OR_ZERO(phy_provider);
+put_child:
+ of_node_put(child);
+ return ret;
}
static const struct of_device_id phy_berlin_sata_of_match[] = {
struct brcm_sata_phy *priv;
struct resource *res;
struct phy_provider *provider;
- int count = 0;
+ int ret, count = 0;
if (of_get_child_count(dn) == 0)
return -ENODEV;
if (of_property_read_u32(child, "reg", &id)) {
dev_err(dev, "missing reg property in node %s\n",
child->name);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
if (id >= MAX_PORTS) {
dev_err(dev, "invalid reg: %u\n", id);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
if (priv->phys[id].phy) {
dev_err(dev, "already registered port %u\n", id);
- return -EINVAL;
+ ret = -EINVAL;
+ goto put_child;
}
port = &priv->phys[id];
port->ssc_en = of_property_read_bool(child, "brcm,enable-ssc");
if (IS_ERR(port->phy)) {
dev_err(dev, "failed to create PHY\n");
- return PTR_ERR(port->phy);
+ ret = PTR_ERR(port->phy);
+ goto put_child;
}
phy_set_drvdata(port->phy, port);
dev_info(dev, "registered %d port(s)\n", count);
return 0;
+put_child:
+ of_node_put(child);
+ return ret;
}
static struct platform_driver brcm_sata_phy_driver = {
* @np: node containing the phy
* @index: index of the phy
*
- * Gets the phy using _of_phy_get(), and associates a device with it using
- * devres. On driver detach, release function is invoked on the devres data,
+ * Gets the phy using _of_phy_get(), then gets a refcount to it,
+ * and associates a device with it using devres. On driver detach,
+ * release function is invoked on the devres data,
* then, devres data is freed.
*
*/
return ERR_PTR(-ENOMEM);
phy = _of_phy_get(np, index);
- if (!IS_ERR(phy)) {
- *ptr = phy;
- devres_add(dev, ptr);
- } else {
+ if (IS_ERR(phy)) {
devres_free(ptr);
+ return phy;
}
+ if (!try_module_get(phy->ops->owner)) {
+ devres_free(ptr);
+ return ERR_PTR(-EPROBE_DEFER);
+ }
+
+ get_device(&phy->dev);
+
+ *ptr = phy;
+ devres_add(dev, ptr);
+
return phy;
}
EXPORT_SYMBOL_GPL(devm_of_phy_get_by_index);
miphy_phy = devm_kzalloc(&pdev->dev, sizeof(*miphy_phy),
GFP_KERNEL);
- if (!miphy_phy)
- return -ENOMEM;
+ if (!miphy_phy) {
+ ret = -ENOMEM;
+ goto put_child;
+ }
miphy_dev->phys[port] = miphy_phy;
phy = devm_phy_create(&pdev->dev, child, &miphy28lp_ops);
if (IS_ERR(phy)) {
dev_err(&pdev->dev, "failed to create PHY\n");
- return PTR_ERR(phy);
+ ret = PTR_ERR(phy);
+ goto put_child;
}
miphy_dev->phys[port]->phy = phy;
ret = miphy28lp_of_probe(child, miphy_phy);
if (ret)
- return ret;
+ goto put_child;
ret = miphy28lp_probe_resets(child, miphy_dev->phys[port]);
if (ret)
- return ret;
+ goto put_child;
phy_set_drvdata(phy, miphy_dev->phys[port]);
port++;
provider = devm_of_phy_provider_register(&pdev->dev, miphy28lp_xlate);
return PTR_ERR_OR_ZERO(provider);
+put_child:
+ of_node_put(child);
+ return ret;
}
static const struct of_device_id miphy28lp_of_match[] = {
miphy_phy = devm_kzalloc(&pdev->dev, sizeof(*miphy_phy),
GFP_KERNEL);
- if (!miphy_phy)
- return -ENOMEM;
+ if (!miphy_phy) {
+ ret = -ENOMEM;
+ goto put_child;
+ }
miphy_dev->phys[port] = miphy_phy;
phy = devm_phy_create(&pdev->dev, child, &miphy365x_ops);
if (IS_ERR(phy)) {
dev_err(&pdev->dev, "failed to create PHY\n");
- return PTR_ERR(phy);
+ ret = PTR_ERR(phy);
+ goto put_child;
}
miphy_dev->phys[port]->phy = phy;
ret = miphy365x_of_probe(child, miphy_phy);
if (ret)
- return ret;
+ goto put_child;
phy_set_drvdata(phy, miphy_dev->phys[port]);
&miphy_phy->ctrlreg);
if (ret) {
dev_err(&pdev->dev, "No sysconfig offset found\n");
- return ret;
+ goto put_child;
}
}
provider = devm_of_phy_provider_register(&pdev->dev, miphy365x_xlate);
return PTR_ERR_OR_ZERO(provider);
+put_child:
+ of_node_put(child);
+ return ret;
}
static const struct of_device_id miphy365x_of_match[] = {
struct resource *sif_res;
struct mt65xx_u3phy *u3phy;
struct resource res;
- int port;
+ int port, retval;
u3phy = devm_kzalloc(dev, sizeof(*u3phy), GFP_KERNEL);
if (!u3phy)
for_each_child_of_node(np, child_np) {
struct mt65xx_phy_instance *instance;
struct phy *phy;
- int retval;
instance = devm_kzalloc(dev, sizeof(*instance), GFP_KERNEL);
- if (!instance)
- return -ENOMEM;
+ if (!instance) {
+ retval = -ENOMEM;
+ goto put_child;
+ }
u3phy->phys[port] = instance;
phy = devm_phy_create(dev, child_np, &mt65xx_u3phy_ops);
if (IS_ERR(phy)) {
dev_err(dev, "failed to create phy\n");
- return PTR_ERR(phy);
+ retval = PTR_ERR(phy);
+ goto put_child;
}
retval = of_address_to_resource(child_np, 0, &res);
if (retval) {
dev_err(dev, "failed to get address resource(id-%d)\n",
port);
- return retval;
+ goto put_child;
}
instance->port_base = devm_ioremap_resource(&phy->dev, &res);
if (IS_ERR(instance->port_base)) {
dev_err(dev, "failed to remap phy regs\n");
- return PTR_ERR(instance->port_base);
+ retval = PTR_ERR(instance->port_base);
+ goto put_child;
}
instance->phy = phy;
provider = devm_of_phy_provider_register(dev, mt65xx_phy_xlate);
return PTR_ERR_OR_ZERO(provider);
+put_child:
+ of_node_put(child_np);
+ return retval;
}
static const struct of_device_id mt65xx_u3phy_id_table[] = {
for_each_available_child_of_node(dev->of_node, child) {
rk_phy = devm_kzalloc(dev, sizeof(*rk_phy), GFP_KERNEL);
- if (!rk_phy)
- return -ENOMEM;
+ if (!rk_phy) {
+ err = -ENOMEM;
+ goto put_child;
+ }
if (of_property_read_u32(child, "reg", &reg_offset)) {
dev_err(dev, "missing reg property in node %s\n",
child->name);
- return -EINVAL;
+ err = -EINVAL;
+ goto put_child;
}
rk_phy->reg_offset = reg_offset;
rk_phy->phy = devm_phy_create(dev, child, &ops);
if (IS_ERR(rk_phy->phy)) {
dev_err(dev, "failed to create PHY\n");
- return PTR_ERR(rk_phy->phy);
+ err = PTR_ERR(rk_phy->phy);
+ goto put_child;
}
phy_set_drvdata(rk_phy->phy, rk_phy);
/* only power up usb phy when it use, so disable it when init*/
err = rockchip_usb_phy_power(rk_phy, 1);
if (err)
- return err;
+ goto put_child;
}
phy_provider = devm_of_phy_provider_register(dev, of_phy_simple_xlate);
return PTR_ERR_OR_ZERO(phy_provider);
+put_child:
+ of_node_put(child);
+ return err;
}
static const struct of_device_id rockchip_usb_phy_dt_ids[] = {
return bcm2835_gpio_get_bit(pc, GPLEV0, offset);
}
-static int bcm2835_gpio_direction_output(struct gpio_chip *chip,
- unsigned offset, int value)
-{
- return pinctrl_gpio_direction_output(chip->base + offset);
-}
-
static void bcm2835_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
{
struct bcm2835_pinctrl *pc = dev_get_drvdata(chip->dev);
bcm2835_gpio_set_bit(pc, value ? GPSET0 : GPCLR0, offset);
}
+static int bcm2835_gpio_direction_output(struct gpio_chip *chip,
+ unsigned offset, int value)
+{
+ bcm2835_gpio_set(chip, offset, value);
+ return pinctrl_gpio_direction_output(chip->base + offset);
+}
+
static int bcm2835_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
{
struct bcm2835_pinctrl *pc = dev_get_drvdata(chip->dev);
static struct imx_pinctrl_soc_info vf610_pinctrl_info = {
.pins = vf610_pinctrl_pads,
.npins = ARRAY_SIZE(vf610_pinctrl_pads),
- .flags = SHARE_MUX_CONF_REG,
+ .flags = SHARE_MUX_CONF_REG | ZERO_OFFSET_VALID,
};
static const struct of_device_id vf610_pinctrl_of_match[] = {
.padcfglock_offset = BXT_PADCFGLOCK, \
.hostown_offset = BXT_HOSTSW_OWN, \
.ie_offset = BXT_GPI_IE, \
+ .gpp_size = 32, \
.pin_base = (s), \
.npins = ((e) - (s) + 1), \
}
#include "pinctrl-intel.h"
-/* Maximum number of pads in each group */
-#define NPADS_IN_GPP 24
-
/* Offset from regs */
#define PADBAR 0x00c
#define GPI_IS 0x100
#define PADOWN_BITS 4
#define PADOWN_SHIFT(p) ((p) % 8 * PADOWN_BITS)
#define PADOWN_MASK(p) (0xf << PADOWN_SHIFT(p))
+#define PADOWN_GPP(p) ((p) / 8)
/* Offset from pad_regs */
#define PADCFG0 0x000
static bool intel_pad_owned_by_host(struct intel_pinctrl *pctrl, unsigned pin)
{
const struct intel_community *community;
- unsigned padno, gpp, gpp_offset, offset;
+ unsigned padno, gpp, offset, group;
void __iomem *padown;
community = intel_get_community(pctrl, pin);
return true;
padno = pin_to_padno(community, pin);
- gpp = padno / NPADS_IN_GPP;
- gpp_offset = padno % NPADS_IN_GPP;
- offset = community->padown_offset + gpp * 16 + (gpp_offset / 8) * 4;
+ group = padno / community->gpp_size;
+ gpp = PADOWN_GPP(padno % community->gpp_size);
+ offset = community->padown_offset + 0x10 * group + gpp * 4;
padown = community->regs + offset;
return !(readl(padown) & PADOWN_MASK(padno));
return false;
padno = pin_to_padno(community, pin);
- gpp = padno / NPADS_IN_GPP;
+ gpp = padno / community->gpp_size;
offset = community->hostown_offset + gpp * 4;
hostown = community->regs + offset;
- return !(readl(hostown) & BIT(padno % NPADS_IN_GPP));
+ return !(readl(hostown) & BIT(padno % community->gpp_size));
}
static bool intel_pad_locked(struct intel_pinctrl *pctrl, unsigned pin)
return false;
padno = pin_to_padno(community, pin);
- gpp = padno / NPADS_IN_GPP;
+ gpp = padno / community->gpp_size;
/*
* If PADCFGLOCK and PADCFGLOCKTX bits are both clear for this pad,
*/
offset = community->padcfglock_offset + gpp * 8;
value = readl(community->regs + offset);
- if (value & BIT(pin % NPADS_IN_GPP))
+ if (value & BIT(pin % community->gpp_size))
return true;
offset = community->padcfglock_offset + 4 + gpp * 8;
value = readl(community->regs + offset);
- if (value & BIT(pin % NPADS_IN_GPP))
+ if (value & BIT(pin % community->gpp_size))
return true;
return false;
community = intel_get_community(pctrl, pin);
if (community) {
unsigned padno = pin_to_padno(community, pin);
- unsigned gpp_offset = padno % NPADS_IN_GPP;
- unsigned gpp = padno / NPADS_IN_GPP;
+ unsigned gpp_offset = padno % community->gpp_size;
+ unsigned gpp = padno / community->gpp_size;
writel(BIT(gpp_offset), community->regs + GPI_IS + gpp * 4);
}
community = intel_get_community(pctrl, pin);
if (community) {
unsigned padno = pin_to_padno(community, pin);
- unsigned gpp_offset = padno % NPADS_IN_GPP;
- unsigned gpp = padno / NPADS_IN_GPP;
+ unsigned gpp_offset = padno % community->gpp_size;
+ unsigned gpp = padno / community->gpp_size;
void __iomem *reg;
u32 value;
return -EINVAL;
padno = pin_to_padno(community, pin);
- gpp = padno / NPADS_IN_GPP;
- gpp_offset = padno % NPADS_IN_GPP;
+ gpp = padno / community->gpp_size;
+ gpp_offset = padno % community->gpp_size;
/* Clear the existing wake status */
writel(BIT(gpp_offset), community->regs + GPI_GPE_STS + gpp * 4);
/* Only interrupts that are enabled */
pending &= enabled;
- for_each_set_bit(gpp_offset, &pending, NPADS_IN_GPP) {
+ for_each_set_bit(gpp_offset, &pending, community->gpp_size) {
unsigned padno, irq;
/*
* The last group in community can have less pins
* than NPADS_IN_GPP.
*/
- padno = gpp_offset + gpp * NPADS_IN_GPP;
+ padno = gpp_offset + gpp * community->gpp_size;
if (padno >= community->npins)
break;
community->regs = regs;
community->pad_regs = regs + padbar;
- community->ngpps = DIV_ROUND_UP(community->npins, NPADS_IN_GPP);
+ community->ngpps = DIV_ROUND_UP(community->npins,
+ community->gpp_size);
}
irq = platform_get_irq(pdev, 0);
* ACPI).
* @ie_offset: Register offset of GPI_IE from @regs.
* @pin_base: Starting pin of pins in this community
+ * @gpp_size: Maximum number of pads in each group, such as PADCFGLOCK,
+ * HOSTSW_OWN, GPI_IS, GPI_IE, etc.
* @npins: Number of pins in this community
* @regs: Community specific common registers (reserved for core driver)
* @pad_regs: Community specific pad registers (reserved for core driver)
unsigned hostown_offset;
unsigned ie_offset;
unsigned pin_base;
+ unsigned gpp_size;
size_t npins;
void __iomem *regs;
void __iomem *pad_regs;
.padcfglock_offset = SPT_PADCFGLOCK, \
.hostown_offset = SPT_HOSTSW_OWN, \
.ie_offset = SPT_GPI_IE, \
+ .gpp_size = 24, \
.pin_base = (s), \
.npins = ((e) - (s) + 1), \
}
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
/* check if the domain is locked by BIOS */
- if (rapl_read_data_raw(rd, FW_LOCK, false, &locked)) {
+ ret = rapl_read_data_raw(rd, FW_LOCK, false, &locked);
+ if (ret)
+ return ret;
+ if (locked) {
pr_info("RAPL package %d domain %s locked by BIOS\n",
rp->id, rd->name);
- rd->state |= DOMAIN_STATE_BIOS_LOCKED;
+ rd->state |= DOMAIN_STATE_BIOS_LOCKED;
}
}
platform_set_drvdata(pdev, rtc);
+ rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, DA9063_DRVNAME_RTC,
+ &da9063_rtc_ops, THIS_MODULE);
+ if (IS_ERR(rtc->rtc_dev))
+ return PTR_ERR(rtc->rtc_dev);
+
+ da9063_data_to_tm(data, &rtc->alarm_time, rtc);
+ rtc->rtc_sync = false;
+
irq_alarm = platform_get_irq_byname(pdev, "ALARM");
ret = devm_request_threaded_irq(&pdev->dev, irq_alarm, NULL,
da9063_alarm_event,
IRQF_TRIGGER_LOW | IRQF_ONESHOT,
"ALARM", rtc);
- if (ret) {
+ if (ret)
dev_err(&pdev->dev, "Failed to request ALARM IRQ %d: %d\n",
irq_alarm, ret);
- return ret;
- }
-
- rtc->rtc_dev = devm_rtc_device_register(&pdev->dev, DA9063_DRVNAME_RTC,
- &da9063_rtc_ops, THIS_MODULE);
- if (IS_ERR(rtc->rtc_dev))
- return PTR_ERR(rtc->rtc_dev);
- da9063_data_to_tm(data, &rtc->alarm_time, rtc);
- rtc->rtc_sync = false;
return ret;
}
int irq;
};
+/*
+ * The Rockchip calendar used by the RK808 counts November with 31 days. We use
+ * these translation functions to convert its dates to/from the Gregorian
+ * calendar used by the rest of the world. We arbitrarily define Jan 1st, 2016
+ * as the day when both calendars were in sync, and treat all other dates
+ * relative to that.
+ * NOTE: Other system software (e.g. firmware) that reads the same hardware must
+ * implement this exact same conversion algorithm, with the same anchor date.
+ */
+static time64_t nov2dec_transitions(struct rtc_time *tm)
+{
+ return (tm->tm_year + 1900) - 2016 + (tm->tm_mon + 1 > 11 ? 1 : 0);
+}
+
+static void rockchip_to_gregorian(struct rtc_time *tm)
+{
+ /* If it's Nov 31st, rtc_tm_to_time64() will count that like Dec 1st */
+ time64_t time = rtc_tm_to_time64(tm);
+ rtc_time64_to_tm(time + nov2dec_transitions(tm) * 86400, tm);
+}
+
+static void gregorian_to_rockchip(struct rtc_time *tm)
+{
+ time64_t extra_days = nov2dec_transitions(tm);
+ time64_t time = rtc_tm_to_time64(tm);
+ rtc_time64_to_tm(time - extra_days * 86400, tm);
+
+ /* Compensate if we went back over Nov 31st (will work up to 2381) */
+ if (nov2dec_transitions(tm) < extra_days) {
+ if (tm->tm_mon + 1 == 11)
+ tm->tm_mday++; /* This may result in 31! */
+ else
+ rtc_time64_to_tm(time - (extra_days - 1) * 86400, tm);
+ }
+}
+
/* Read current time and date in RTC */
static int rk808_rtc_readtime(struct device *dev, struct rtc_time *tm)
{
tm->tm_mon = (bcd2bin(rtc_data[4] & MONTHS_REG_MSK)) - 1;
tm->tm_year = (bcd2bin(rtc_data[5] & YEARS_REG_MSK)) + 100;
tm->tm_wday = bcd2bin(rtc_data[6] & WEEKS_REG_MSK);
+ rockchip_to_gregorian(tm);
dev_dbg(dev, "RTC date/time %4d-%02d-%02d(%d) %02d:%02d:%02d\n",
1900 + tm->tm_year, tm->tm_mon + 1, tm->tm_mday,
- tm->tm_wday, tm->tm_hour , tm->tm_min, tm->tm_sec);
+ tm->tm_wday, tm->tm_hour, tm->tm_min, tm->tm_sec);
return ret;
}
u8 rtc_data[NUM_TIME_REGS];
int ret;
+ dev_dbg(dev, "set RTC date/time %4d-%02d-%02d(%d) %02d:%02d:%02d\n",
+ 1900 + tm->tm_year, tm->tm_mon + 1, tm->tm_mday,
+ tm->tm_wday, tm->tm_hour, tm->tm_min, tm->tm_sec);
+ gregorian_to_rockchip(tm);
rtc_data[0] = bin2bcd(tm->tm_sec);
rtc_data[1] = bin2bcd(tm->tm_min);
rtc_data[2] = bin2bcd(tm->tm_hour);
rtc_data[4] = bin2bcd(tm->tm_mon + 1);
rtc_data[5] = bin2bcd(tm->tm_year - 100);
rtc_data[6] = bin2bcd(tm->tm_wday);
- dev_dbg(dev, "set RTC date/time %4d-%02d-%02d(%d) %02d:%02d:%02d\n",
- 1900 + tm->tm_year, tm->tm_mon + 1, tm->tm_mday,
- tm->tm_wday, tm->tm_hour , tm->tm_min, tm->tm_sec);
/* Stop RTC while updating the RTC registers */
ret = regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
alrm->time.tm_mday = bcd2bin(alrm_data[3] & DAYS_REG_MSK);
alrm->time.tm_mon = (bcd2bin(alrm_data[4] & MONTHS_REG_MSK)) - 1;
alrm->time.tm_year = (bcd2bin(alrm_data[5] & YEARS_REG_MSK)) + 100;
+ rockchip_to_gregorian(&alrm->time);
ret = regmap_read(rk808->regmap, RK808_RTC_INT_REG, &int_reg);
if (ret) {
alrm->time.tm_mday, alrm->time.tm_wday, alrm->time.tm_hour,
alrm->time.tm_min, alrm->time.tm_sec);
+ gregorian_to_rockchip(&alrm->time);
alrm_data[0] = bin2bcd(alrm->time.tm_sec);
alrm_data[1] = bin2bcd(alrm->time.tm_min);
alrm_data[2] = bin2bcd(alrm->time.tm_hour);
status = ap_sm_recv(ap_dev);
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
- if (ap_dev->queue_count > 0)
+ if (ap_dev->queue_count > 0) {
+ ap_dev->state = AP_STATE_WORKING;
return AP_WAIT_AGAIN;
+ }
ap_dev->state = AP_STATE_IDLE;
return AP_WAIT_NONE;
case AP_RESPONSE_NO_PENDING_REPLY:
return vq;
}
+static void virtio_ccw_check_activity(struct virtio_ccw_device *vcdev,
+ __u32 activity)
+{
+ if (vcdev->curr_io & activity) {
+ switch (activity) {
+ case VIRTIO_CCW_DOING_READ_FEAT:
+ case VIRTIO_CCW_DOING_WRITE_FEAT:
+ case VIRTIO_CCW_DOING_READ_CONFIG:
+ case VIRTIO_CCW_DOING_WRITE_CONFIG:
+ case VIRTIO_CCW_DOING_WRITE_STATUS:
+ case VIRTIO_CCW_DOING_SET_VQ:
+ case VIRTIO_CCW_DOING_SET_IND:
+ case VIRTIO_CCW_DOING_SET_CONF_IND:
+ case VIRTIO_CCW_DOING_RESET:
+ case VIRTIO_CCW_DOING_READ_VQ_CONF:
+ case VIRTIO_CCW_DOING_SET_IND_ADAPTER:
+ case VIRTIO_CCW_DOING_SET_VIRTIO_REV:
+ vcdev->curr_io &= ~activity;
+ wake_up(&vcdev->wait_q);
+ break;
+ default:
+ /* don't know what to do... */
+ dev_warn(&vcdev->cdev->dev,
+ "Suspicious activity '%08x'\n", activity);
+ WARN_ON(1);
+ break;
+ }
+ }
+}
+
static void virtio_ccw_int_handler(struct ccw_device *cdev,
unsigned long intparm,
struct irb *irb)
if (!vcdev)
return;
+ if (IS_ERR(irb)) {
+ vcdev->err = PTR_ERR(irb);
+ virtio_ccw_check_activity(vcdev, activity);
+ /* Don't poke around indicators, something's wrong. */
+ return;
+ }
/* Check if it's a notification from the host. */
if ((intparm == 0) &&
(scsw_stctl(&irb->scsw) ==
/* Map everything else to -EIO. */
vcdev->err = -EIO;
}
- if (vcdev->curr_io & activity) {
- switch (activity) {
- case VIRTIO_CCW_DOING_READ_FEAT:
- case VIRTIO_CCW_DOING_WRITE_FEAT:
- case VIRTIO_CCW_DOING_READ_CONFIG:
- case VIRTIO_CCW_DOING_WRITE_CONFIG:
- case VIRTIO_CCW_DOING_WRITE_STATUS:
- case VIRTIO_CCW_DOING_SET_VQ:
- case VIRTIO_CCW_DOING_SET_IND:
- case VIRTIO_CCW_DOING_SET_CONF_IND:
- case VIRTIO_CCW_DOING_RESET:
- case VIRTIO_CCW_DOING_READ_VQ_CONF:
- case VIRTIO_CCW_DOING_SET_IND_ADAPTER:
- case VIRTIO_CCW_DOING_SET_VIRTIO_REV:
- vcdev->curr_io &= ~activity;
- wake_up(&vcdev->wait_q);
- break;
- default:
- /* don't know what to do... */
- dev_warn(&cdev->dev, "Suspicious activity '%08x'\n",
- activity);
- WARN_ON(1);
- break;
- }
- }
+ virtio_ccw_check_activity(vcdev, activity);
for_each_set_bit(i, &vcdev->indicators,
sizeof(vcdev->indicators) * BITS_PER_BYTE) {
/* The bit clear must happen before the vring kick. */
struct scsi_device *sdev = to_scsi_device(dev);
int err = 0;
- if (pm && pm->runtime_suspend) {
- err = blk_pre_runtime_suspend(sdev->request_queue);
- if (err)
- return err;
+ err = blk_pre_runtime_suspend(sdev->request_queue);
+ if (err)
+ return err;
+ if (pm && pm->runtime_suspend)
err = pm->runtime_suspend(dev);
- blk_post_runtime_suspend(sdev->request_queue, err);
- }
+ blk_post_runtime_suspend(sdev->request_queue, err);
+
return err;
}
const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
int err = 0;
- if (pm && pm->runtime_resume) {
- blk_pre_runtime_resume(sdev->request_queue);
+ blk_pre_runtime_resume(sdev->request_queue);
+ if (pm && pm->runtime_resume)
err = pm->runtime_resume(dev);
- blk_post_runtime_resume(sdev->request_queue, err);
- }
+ blk_post_runtime_resume(sdev->request_queue, err);
+
return err;
}
/*
* Use the device's preferred I/O size for reads and writes
- * unless the reported value is unreasonably large (or garbage).
+ * unless the reported value is unreasonably small, large, or
+ * garbage.
*/
- if (sdkp->opt_xfer_blocks && sdkp->opt_xfer_blocks <= dev_max &&
- sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS)
+ if (sdkp->opt_xfer_blocks &&
+ sdkp->opt_xfer_blocks <= dev_max &&
+ sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS &&
+ sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_CACHE_SIZE)
rw_max = q->limits.io_opt =
logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
else
static int ses_recv_diag(struct scsi_device *sdev, int page_code,
void *buf, int bufflen)
{
+ int ret;
unsigned char cmd[] = {
RECEIVE_DIAGNOSTIC,
1, /* Set PCV bit */
bufflen & 0xff,
0
};
+ unsigned char recv_page_code;
- return scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen,
+ ret = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, bufflen,
NULL, SES_TIMEOUT, SES_RETRIES, NULL);
+ /* A non-zero result means the command failed: buf holds no valid
+ * page, so hand the error back without inspecting it */
+ if (unlikely(ret))
+ return ret;
+
+ recv_page_code = ((unsigned char *)buf)[0];
+
+ if (likely(recv_page_code == page_code))
+ return ret;
+
+ /* successful diagnostic but wrong page code. This happens to some
+ * USB devices, just print a message and pretend there was an error */
+
+ sdev_printk(KERN_ERR, sdev,
+ "Wrong diagnostic page; asked for %d got %u\n",
+ page_code, recv_page_code);
+
+ return -EINVAL;
}
static int ses_send_diag(struct scsi_device *sdev, int page_code,
if (desc_ptr)
desc_ptr += len;
- if (addl_desc_ptr)
+ if (addl_desc_ptr &&
+ /* only find additional descriptions for specific devices */
+ (type_ptr[0] == ENCLOSURE_COMPONENT_DEVICE ||
+ type_ptr[0] == ENCLOSURE_COMPONENT_ARRAY_DEVICE ||
+ type_ptr[0] == ENCLOSURE_COMPONENT_SAS_EXPANDER ||
+ /* these elements are optional */
+ type_ptr[0] == ENCLOSURE_COMPONENT_SCSI_TARGET_PORT ||
+ type_ptr[0] == ENCLOSURE_COMPONENT_SCSI_INITIATOR_PORT ||
+ type_ptr[0] == ENCLOSURE_COMPONENT_CONTROLLER_ELECTRONICS))
addl_desc_ptr += addl_desc_ptr[1] + 2;
}
{
unsigned int val;
- regmap_read(dspi->regmap, SPI_CTAR(dspi->cs), &val);
+ regmap_read(dspi->regmap, SPI_CTAR(0), &val);
return ((val & SPI_FRAME_BITS_MASK) == SPI_FRAME_BITS(8)) ? 0 : 1;
}
return SPI_PUSHR_TXDATA(d16) |
SPI_PUSHR_PCS(dspi->cs) |
- SPI_PUSHR_CTAS(dspi->cs) |
+ SPI_PUSHR_CTAS(0) |
SPI_PUSHR_CONT;
}
*/
if (tx_word && (dspi->len == 1)) {
dspi->dataflags |= TRAN_STATE_WORD_ODD_NUM;
- regmap_update_bits(dspi->regmap, SPI_CTAR(dspi->cs),
+ regmap_update_bits(dspi->regmap, SPI_CTAR(0),
SPI_FRAME_BITS_MASK, SPI_FRAME_BITS(8));
tx_word = 0;
}
if (tx_word && (dspi->len == 1)) {
dspi->dataflags |= TRAN_STATE_WORD_ODD_NUM;
- regmap_update_bits(dspi->regmap, SPI_CTAR(dspi->cs),
+ regmap_update_bits(dspi->regmap, SPI_CTAR(0),
SPI_FRAME_BITS_MASK, SPI_FRAME_BITS(8));
tx_word = 0;
}
regmap_update_bits(dspi->regmap, SPI_MCR,
SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF,
SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF);
- regmap_write(dspi->regmap, SPI_CTAR(dspi->cs),
+ regmap_write(dspi->regmap, SPI_CTAR(0),
dspi->cur_chip->ctar_val);
trans_mode = dspi->devtype_data->trans_mode;
if (!dspi->len) {
if (dspi->dataflags & TRAN_STATE_WORD_ODD_NUM) {
regmap_update_bits(dspi->regmap,
- SPI_CTAR(dspi->cs),
+ SPI_CTAR(0),
SPI_FRAME_BITS_MASK,
SPI_FRAME_BITS(16));
dspi->dataflags &= ~TRAN_STATE_WORD_ODD_NUM;
master->bus_num = -1;
master->num_chipselect = 1;
master->dev.class = &spi_master_class;
- master->dev.parent = get_device(dev);
+ master->dev.parent = dev;
spi_master_set_devdata(master, &master[1]);
return master;
kfree(spidev->rx_buffer);
spidev->rx_buffer = NULL;
+ spin_lock_irq(&spidev->spi_lock);
if (spidev->spi)
spidev->speed_hz = spidev->spi->max_speed_hz;
/* ... after we unbound from the underlying device? */
- spin_lock_irq(&spidev->spi_lock);
dofree = (spidev->spi == NULL);
spin_unlock_irq(&spidev->spi_lock);
err:
sg = table->sgl;
for (i -= 1; i >= 0; i--) {
- gen_pool_free(chunk_heap->pool, sg_phys(sg) & PAGE_MASK,
+ gen_pool_free(chunk_heap->pool, page_to_phys(sg_page(sg)),
sg->length);
sg = sg_next(sg);
}
DMA_BIDIRECTIONAL);
for_each_sg(table->sgl, sg, table->nents, i) {
- gen_pool_free(chunk_heap->pool, sg_phys(sg) & PAGE_MASK,
+ gen_pool_free(chunk_heap->pool, page_to_phys(sg_page(sg)),
sg->length);
}
chunk_heap->allocated -= allocated_size;
return rc;
}
-static const char *ll_follow_link(struct dentry *dentry, void **cookie)
+static void ll_put_link(void *p)
+{
+ ptlrpc_req_finished(p);
+}
+
+static const char *ll_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct ptlrpc_request *request = NULL;
int rc;
char *symname = NULL;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
CDEBUG(D_VFSTRACE, "VFS Op\n");
ll_inode_size_lock(inode);
}
/* symname may contain a pointer to the request message buffer,
- * we delay request releasing until ll_put_link then.
+ * so we delay releasing the request until the delayed call runs.
*/
- *cookie = request;
+ set_delayed_call(done, ll_put_link, request);
return symname;
}
-static void ll_put_link(struct inode *unused, void *cookie)
-{
- ptlrpc_req_finished(cookie);
-}
-
struct inode_operations ll_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.setattr = ll_setattr,
- .follow_link = ll_follow_link,
- .put_link = ll_put_link,
+ .get_link = ll_get_link,
.getattr = ll_getattr,
.permission = ll_inode_permission,
.setxattr = ll_setxattr,
static
int get_xattr_type(const char *name)
{
- if (!strcmp(name, POSIX_ACL_XATTR_ACCESS))
+ if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS))
return XATTR_ACL_ACCESS_T;
- if (!strcmp(name, POSIX_ACL_XATTR_DEFAULT))
+ if (!strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT))
return XATTR_ACL_DEFAULT_T;
if (!strncmp(name, XATTR_USER_PREFIX,
size_t eol;
size_t tail;
int ret, found = 0;
- bool eof_push = 0;
/* N.B. avoid overrun if nr == 0 */
- n = min(*nr, smp_load_acquire(&ldata->canon_head) - ldata->read_tail);
- if (!n)
+ if (!*nr)
return 0;
+ n = min(*nr + 1, smp_load_acquire(&ldata->canon_head) - ldata->read_tail);
+
tail = ldata->read_tail & (N_TTY_BUF_SIZE - 1);
size = min_t(size_t, tail + n, N_TTY_BUF_SIZE);
n = eol - tail;
if (n > N_TTY_BUF_SIZE)
n += N_TTY_BUF_SIZE;
- n += found;
- c = n;
+ c = n + found;
- if (found && !ldata->push && read_buf(ldata, eol) == __DISABLED_CHAR) {
- n--;
- eof_push = !n && ldata->read_tail != ldata->line_start;
+ if (!found || read_buf(ldata, eol) != __DISABLED_CHAR) {
+ c = min(*nr, c);
+ n = c;
}
n_tty_trace("%s: eol:%zu found:%d n:%zu c:%zu size:%zu more:%zu\n",
ldata->push = 0;
tty_audit_push(tty);
}
- return eof_push ? -EAGAIN : 0;
+ return 0;
}
extern ssize_t redirected_tty_write(struct file *, const char __user *,
if (ldata->icanon && !L_EXTPROC(tty)) {
retval = canon_copy_from_read_buf(tty, &b, &nr);
- if (retval == -EAGAIN) {
- retval = 0;
- continue;
- } else if (retval)
+ if (retval)
break;
} else {
int uncopied;
*/
static int uniphier_serial_dl_read(struct uart_8250_port *up)
{
- return readl(up->port.membase + UNIPHIER_UART_DLR);
+ int offset = UNIPHIER_UART_DLR << up->port.regshift;
+
+ return readl(up->port.membase + offset);
}
static void uniphier_serial_dl_write(struct uart_8250_port *up, int value)
{
- writel(value, up->port.membase + UNIPHIER_UART_DLR);
+ int offset = UNIPHIER_UART_DLR << up->port.regshift;
+
+ writel(value, up->port.membase + offset);
}
static int uniphier_of_serial_setup(struct device *dev, struct uart_port *port,
if (buf && !parse_options(&early_console_dev, buf))
buf = NULL;
+ spin_lock_init(&port->lock);
port->uartclk = BASE_BAUD * 16;
if (port->mapbase)
port->membase = earlycon_map(port->mapbase, 64);
int err;
struct uart_port *port = &early_console_dev.port;
+ spin_lock_init(&port->lock);
port->iotype = UPIO_MEM;
port->mapbase = addr;
port->uartclk = BASE_BAUD * 16;
sg_init_table(sg, 1);
s->rx_buf[i] = buf;
sg_dma_address(sg) = dma;
- sg->length = s->buf_len_rx;
+ sg_dma_len(sg) = s->buf_len_rx;
buf += s->buf_len_rx;
dma += s->buf_len_rx;
uart_handle_dcd_change(port, 1);
}
- for (i = 0; i < bytes_read; i++)
- uart_handle_sysrq_char(port, con_read_page[i]);
+ if (port->sysrq != 0 && *con_read_page) {
+ for (i = 0; i < bytes_read; i++)
+ uart_handle_sysrq_char(port, con_read_page[i]);
+ }
if (port->state == NULL)
continue;
int (*receive_chars)(struct uart_port *port);
};
-static struct sunhv_ops bychar_ops = {
+static const struct sunhv_ops bychar_ops = {
.transmit_chars = transmit_chars_putchar,
.receive_chars = receive_chars_getchar,
};
-static struct sunhv_ops bywrite_ops = {
+static const struct sunhv_ops bywrite_ops = {
.transmit_chars = transmit_chars_write,
.receive_chars = receive_chars_read,
};
-static struct sunhv_ops *sunhv_ops = &bychar_ops;
+static const struct sunhv_ops *sunhv_ops = &bychar_ops;
static struct tty_port *receive_chars(struct uart_port *port)
{
{
char *killer = NULL;
+ /* we need to release the RCU read lock here,
+ * otherwise we get an annoying
+ * 'BUG: sleeping function called from invalid context'
+ * complaint from the kernel before the panic.
+ */
+ rcu_read_unlock();
panic_on_oops = 1; /* force panic */
wmb();
*killer = 1;
count = disc->ops->receive_buf2(tty, p, f, count);
else {
count = min_t(int, count, tty->receive_room);
- if (count)
+ if (count && disc->ops->receive_buf)
disc->ops->receive_buf(tty, p, f, count);
}
return count;
unsigned delay;
/* Continue a partial initialization */
- if (type == HUB_INIT2)
- goto init2;
- if (type == HUB_INIT3)
+ if (type == HUB_INIT2 || type == HUB_INIT3) {
+ device_lock(hub->intfdev);
+
+ /* Was the hub disconnected while we were waiting? */
+ if (hub->disconnected) {
+ device_unlock(hub->intfdev);
+ kref_put(&hub->kref, hub_release);
+ return;
+ }
+ if (type == HUB_INIT2)
+ goto init2;
goto init3;
+ }
+ kref_get(&hub->kref);
/* The superspeed hub except for root hub has to use Hub Depth
* value as an offset into the route string to locate the bits
queue_delayed_work(system_power_efficient_wq,
&hub->init_work,
msecs_to_jiffies(delay));
+ device_unlock(hub->intfdev);
return; /* Continues at init3: below */
} else {
msleep(delay);
/* Allow autosuspend if it was suppressed */
if (type <= HUB_INIT3)
usb_autopm_put_interface_async(to_usb_interface(hub->intfdev));
+
+ if (type == HUB_INIT2 || type == HUB_INIT3)
+ device_unlock(hub->intfdev);
+
+ kref_put(&hub->kref, hub_release);
}
/* Implement the continuations for the delays above */
* through. Since this has a reasonably high failure rate, we retry
* several times.
*/
- while (retries--) {
+ while (retries) {
+ retries--;
result = usb_control_msg(serial->dev,
usb_sndctrlpipe(serial->dev, 0), 0x22, 0x21,
0x1, 0, NULL, 0, 100);
port = FSL_DIU_PORT_DLVDS;
}
- return diu_ops.valid_monitor_port(port);
+ if (diu_ops.valid_monitor_port)
+ port = diu_ops.valid_monitor_port(port);
+
+ return port;
}
/*
#else
monitor_port = fsl_diu_name_to_port(monitor_string);
#endif
+
+ /*
+ * We must verify set_pixel_clock. If it is not implemented on the
+ * platform, then there is no platform support for the DIU.
+ */
+ if (!diu_ops.set_pixel_clock)
+ return -ENODEV;
+
pr_info("Freescale Display Interface Unit (DIU) framebuffer driver\n");
#ifdef CONFIG_NOT_COHERENT_CACHE
.vbp = 41,
.interlace = true,
+
+ .hsync_level = OMAPDSS_SIG_ACTIVE_LOW,
+ .vsync_level = OMAPDSS_SIG_ACTIVE_LOW,
+ .data_pclk_edge = OMAPDSS_DRIVE_SIG_RISING_EDGE,
+ .de_level = OMAPDSS_SIG_ACTIVE_HIGH,
+ .sync_pclk_edge = OMAPDSS_DRIVE_SIG_FALLING_EDGE,
};
EXPORT_SYMBOL(omap_dss_pal_timings);
.vbp = 31,
.interlace = true,
+
+ .hsync_level = OMAPDSS_SIG_ACTIVE_LOW,
+ .vsync_level = OMAPDSS_SIG_ACTIVE_LOW,
+ .data_pclk_edge = OMAPDSS_DRIVE_SIG_RISING_EDGE,
+ .de_level = OMAPDSS_SIG_ACTIVE_HIGH,
+ .sync_pclk_edge = OMAPDSS_DRIVE_SIG_FALLING_EDGE,
};
EXPORT_SYMBOL(omap_dss_ntsc_timings);
static void consume_one_event(unsigned cpu,
struct evtchn_fifo_control_block *control_block,
- unsigned priority, unsigned long *ready)
+ unsigned priority, unsigned long *ready,
+ bool drop)
{
struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
uint32_t head;
if (head == 0)
clear_bit(priority, ready);
- if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port))
- handle_irq_for_port(port);
+ if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) {
+ if (unlikely(drop))
+ pr_warn("Dropping pending event for port %u\n", port);
+ else
+ handle_irq_for_port(port);
+ }
q->head[priority] = head;
}
-static void evtchn_fifo_handle_events(unsigned cpu)
+static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
{
struct evtchn_fifo_control_block *control_block;
unsigned long ready;
while (ready) {
q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES);
- consume_one_event(cpu, control_block, q, &ready);
+ consume_one_event(cpu, control_block, q, &ready, drop);
ready |= xchg(&control_block->ready, 0);
}
}
+static void evtchn_fifo_handle_events(unsigned cpu)
+{
+ __evtchn_fifo_handle_events(cpu, false);
+}
+
static void evtchn_fifo_resume(void)
{
unsigned cpu;
if (!per_cpu(cpu_control_block, cpu))
ret = evtchn_fifo_alloc_control_block(cpu);
break;
+ case CPU_DEAD:
+ __evtchn_fifo_handle_events(cpu, true);
+ break;
default:
break;
}
struct xen_pci_sharedinfo *sh_info;
unsigned long flags;
struct work_struct op_work;
+ struct xen_pci_op op;
};
struct xen_pcibk_dev_data {
enable ? "enable" : "disable");
if (enable) {
+ /*
+ * The MSI or MSI-X should not have an IRQ handler. Otherwise
+ * if the guest terminates we BUG_ON in free_msi_irqs.
+ */
+ if (dev->msi_enabled || dev->msix_enabled)
+ goto out;
+
rc = request_irq(dev_data->irq,
xen_pcibk_guest_interrupt, IRQF_SHARED,
dev_data->irq_name, dev);
if (unlikely(verbose_request))
printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev));
- status = pci_enable_msi(dev);
+ if (dev->msi_enabled)
+ status = -EALREADY;
+ else if (dev->msix_enabled)
+ status = -ENXIO;
+ else
+ status = pci_enable_msi(dev);
if (status) {
pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n",
int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev,
struct pci_dev *dev, struct xen_pci_op *op)
{
- struct xen_pcibk_dev_data *dev_data;
-
if (unlikely(verbose_request))
printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n",
pci_name(dev));
- pci_disable_msi(dev);
+ if (dev->msi_enabled) {
+ struct xen_pcibk_dev_data *dev_data;
+
+ pci_disable_msi(dev);
+
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ }
op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
if (unlikely(verbose_request))
printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
op->value);
- dev_data = pci_get_drvdata(dev);
- if (dev_data)
- dev_data->ack_intr = 1;
return 0;
}
struct xen_pcibk_dev_data *dev_data;
int i, result;
struct msix_entry *entries;
+ u16 cmd;
if (unlikely(verbose_request))
printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n",
pci_name(dev));
+
if (op->value > SH_INFO_MAX_VEC)
return -EINVAL;
+ if (dev->msix_enabled)
+ return -EALREADY;
+
+ /*
+ * PCI_COMMAND_MEMORY must be enabled, otherwise we may not be able
+ * to access the BARs where the MSI-X entries reside.
+ */
+ pci_read_config_word(dev, PCI_COMMAND, &cmd);
+ if (dev->msi_enabled || !(cmd & PCI_COMMAND_MEMORY))
+ return -ENXIO;
+
entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
if (entries == NULL)
return -ENOMEM;
int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
struct pci_dev *dev, struct xen_pci_op *op)
{
- struct xen_pcibk_dev_data *dev_data;
if (unlikely(verbose_request))
printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
pci_name(dev));
- pci_disable_msix(dev);
+ if (dev->msix_enabled) {
+ struct xen_pcibk_dev_data *dev_data;
+
+ pci_disable_msix(dev);
+
+ dev_data = pci_get_drvdata(dev);
+ if (dev_data)
+ dev_data->ack_intr = 1;
+ }
/*
* SR-IOV devices (which don't have any legacy IRQ) have
* an undefined IRQ value of zero.
*/
op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
if (unlikely(verbose_request))
- printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev),
- op->value);
- dev_data = pci_get_drvdata(dev);
- if (dev_data)
- dev_data->ack_intr = 1;
+ printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n",
+ pci_name(dev), op->value);
return 0;
}
#endif
container_of(data, struct xen_pcibk_device, op_work);
struct pci_dev *dev;
struct xen_pcibk_dev_data *dev_data = NULL;
- struct xen_pci_op *op = &pdev->sh_info->op;
+ struct xen_pci_op *op = &pdev->op;
int test_intx = 0;
+ *op = pdev->sh_info->op;
+ barrier();
dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
if (dev == NULL)
if ((dev_data->enable_intx != test_intx))
xen_pcibk_control_isr(dev, 0 /* no reset */);
}
+ pdev->sh_info->op.err = op->err;
+ pdev->sh_info->op.value = op->value;
+#ifdef CONFIG_PCI_MSI
+ if (op->cmd == XEN_PCI_OP_enable_msix && op->err == 0) {
+ unsigned int i;
+
+ for (i = 0; i < op->value; i++)
+ pdev->sh_info->op.msix_entries[i].vector =
+ op->msix_entries[i].vector;
+ }
+#endif
/* Tell the driver domain that we're done. */
wmb();
clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
pdev->xdev = xdev;
- dev_set_drvdata(&xdev->dev, pdev);
mutex_init(&pdev->dev_lock);
kfree(pdev);
pdev = NULL;
}
+
+ dev_set_drvdata(&xdev->dev, pdev);
+
out:
return pdev;
}
if (!pending_req)
return 1;
- ring_req = *RING_GET_REQUEST(ring, rc);
+ RING_COPY_REQUEST(ring, rc, &ring_req);
ring->req_cons = ++rc;
err = prepare_pending_reqs(info, &ring_req, pending_req);
return 0;
}
/* get the default/access acl values and cache them */
- dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
- pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+ dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT);
+ pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS);
if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
goto err_free_out;
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
struct posix_acl *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* We allow set/get/list of acl when access=client is not specified
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_get(dentry, handler->prefix, buffer, size);
+ return v9fs_xattr_get(dentry, handler->name, buffer, size);
acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
if (IS_ERR(acl))
struct v9fs_session_info *v9ses;
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* set the attribute on the remote. Without even looking at the
* xattr value. We leave it to the server to validate
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_xattr_set(dentry, handler->prefix, value, size,
+ return v9fs_xattr_set(dentry, handler->name, value, size,
flags);
if (S_ISLNK(inode->i_mode))
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, handler->prefix, value, size, flags);
+ retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
if (!retval)
set_cached_acl(inode, handler->flags, acl);
err_out:
}
const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
};
const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
}
/**
- * v9fs_vfs_follow_link - follow a symlink path
+ * v9fs_vfs_get_link - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: delayed call for when we are done with the return value
*/
-static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *v9fs_vfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid;
struct p9_wstat *st;
char *res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ fid = v9fs_fid_lookup(dentry);
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (IS_ERR(fid))
p9stat_free(st);
kfree(st);
- return *cookie = res;
+ set_delayed_call(done, kfree_link, res);
+ return res;
}
/**
static const struct inode_operations v9fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
}
/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * v9fs_vfs_get_link_dotl - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: destructor for return value
*/
static const char *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
+v9fs_vfs_get_link_dotl(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct p9_fid *fid;
char *target;
int retval;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
if (retval)
return ERR_PTR(retval);
- return *cookie = target;
+ set_delayed_call(done, kfree_link, target);
+ return target;
}
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link_dotl,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link_dotl,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.setxattr = generic_setxattr,
{
const char *full_name = xattr_full_name(handler, name);
- if (strcmp(name, "") == 0)
- return -EINVAL;
return v9fs_xattr_get(dentry, full_name, buffer, size);
}
{
const char *full_name = xattr_full_name(handler, name);
- if (strcmp(name, "") == 0)
- return -EINVAL;
return v9fs_xattr_set(dentry, full_name, value, size, flags);
}
break;
case ST_SOFTLINK:
inode->i_mode |= S_IFLNK;
+ inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
inode->i_data.a_ops = &affs_symlink_aops;
break;
return -ENOSPC;
inode->i_op = &affs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &affs_symlink_aops;
inode->i_mode = S_IFLNK | 0777;
mode_to_prot(inode);
{
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
- char *link = kmap(page);
+ char *link = page_address(page);
struct slink_front *lf;
int i, j;
char c;
char lc;
- pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
+ pr_debug("get_link(ino=%lu)\n", inode->i_ino);
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
link[i] = '\0';
affs_brelse(bh);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
const struct inode_operations affs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = affs_notify_change,
};
case AFS_FTYPE_SYMLINK:
inode->i_mode = S_IFLNK | vnode->status.mode;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
default:
printk("kAFS: AFS vnode with undefined type\n");
#include "autofs_i.h"
-static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
+static const char *autofs4_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi;
+ struct autofs_info *ino;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ sbi = autofs4_sbi(dentry->d_sb);
+ ino = autofs4_dentry_ino(dentry);
if (ino && !autofs4_oz_mode(sbi))
ino->last_used = jiffies;
return d_inode(dentry)->i_private;
const struct inode_operations autofs4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = autofs4_follow_link
+ .get_link = autofs4_get_link
};
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static const char *befs_follow_link(struct dentry *, void **);
+static int befs_symlink_readpage(struct file *, struct page *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
.bmap = befs_bmap,
};
-static const struct inode_operations befs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = befs_follow_link,
- .put_link = kfree_put_link,
+static const struct address_space_operations befs_symlink_aops = {
+ .readpage = befs_symlink_readpage,
};
/*
inode->i_fop = &befs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
- inode->i_op = &befs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &befs_symlink_aops;
} else {
inode->i_link = befs_ino->i_data.symlink;
inode->i_op = &simple_symlink_inode_operations;
* The data stream become link name. Unless the LONG_SYMLINK
* flag is set.
*/
-static const char *
-befs_follow_link(struct dentry *dentry, void **cookie)
+static int befs_symlink_readpage(struct file *unused, struct page *page)
{
- struct super_block *sb = dentry->d_sb;
- struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link;
+ char *link = page_address(page);
- if (len == 0) {
+ if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
- return ERR_PTR(-EIO);
+ goto fail;
}
befs_debug(sb, "Follow long symlink");
- link = kmalloc(len, GFP_NOFS);
- if (!link)
- return ERR_PTR(-ENOMEM);
if (befs_read_lsymlink(sb, data, link, len) != len) {
- kfree(link);
befs_error(sb, "Failed to read entire long symlink");
- return ERR_PTR(-EIO);
+ goto fail;
}
link[len - 1] = '\0';
- return *cookie = link;
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+fail:
+ SetPageError(page);
+ unlock_page(page);
+ return -EIO;
}
/*
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EINVAL : 0;
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
* until transaction commit to do the actual discard.
*/
if (trimming) {
- WARN_ON(!list_empty(&block_group->bg_list));
- spin_lock(&trans->transaction->deleted_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
list_move(&block_group->bg_list,
&trans->transaction->deleted_bgs);
- spin_unlock(&trans->transaction->deleted_bgs_lock);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_get_block_group(block_group);
}
end_trans:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos,
+static int prepare_uptodate_page(struct inode *inode,
+ struct page *page, u64 pos,
bool force_uptodate)
{
int ret = 0;
unlock_page(page);
return -EIO;
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
}
return 0;
}
int faili;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
if (!pages[i]) {
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos,
+ err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (i == num_pages - 1)
- err = prepare_uptodate_page(pages[i],
+ if (!err && i == num_pages - 1)
+ err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
page_cache_release(pages[i]);
+ if (err == -EAGAIN) {
+ err = 0;
+ goto again;
+ }
faili = i - 1;
goto fail;
}
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now",
block_group->key.objectid);
}
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry;
+ struct btrfs_free_space *entry = NULL;
int ret = -ENOSPC;
u64 bitmap_offset = offset_to_bitmap(ctl, offset);
* The bitmap that covers offset won't be in the list unless offset
* is just its start offset.
*/
- entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
- if (entry->offset != bitmap_offset) {
+ if (!list_empty(bitmaps))
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+
+ if (!entry || entry->offset != bitmap_offset) {
entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (entry && list_empty(&entry->list))
list_add(&entry->list, bitmaps);
int scanned = 0;
if (!xattr_access) {
- xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
- xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT));
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
break;
default:
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.update_time = btrfs_update_time,
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
- spin_lock_init(&cur_trans->deleted_bgs_lock);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
- spinlock_t deleted_bgs_lock;
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
ret = btrfs_force_chunk_alloc(trans, chunk_root,
BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans, chunk_root);
if (ret < 0) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
}
-
- btrfs_end_transaction(trans, chunk_root);
chunk_reserved = 1;
}
return ret;
}
-/*
- * List of handlers for synthetic system.* attributes. All real ondisk
- * attributes are handled directly.
- */
-const struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL,
-};
-
-/*
- * Check if the attribute is in a supported namespace.
- *
- * This is applied after the check for the synthetic attributes in the system
- * namespace.
- */
-static int btrfs_is_valid_xattr(const char *name)
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- int len = strlen(name);
- int prefixlen = 0;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN))
- prefixlen = XATTR_SECURITY_PREFIX_LEN;
- else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- prefixlen = XATTR_SYSTEM_PREFIX_LEN;
- else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- prefixlen = XATTR_TRUSTED_PREFIX_LEN;
- else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- prefixlen = XATTR_USER_PREFIX_LEN;
- else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- prefixlen = XATTR_BTRFS_PREFIX_LEN;
- else
- return -EOPNOTSUPP;
-
- /*
- * The name cannot consist of just prefix
- */
- if (len <= prefixlen)
- return -EINVAL;
+ struct inode *inode = d_inode(dentry);
- return 0;
+ name = xattr_full_name(handler, name);
+ return __btrfs_getxattr(inode, name, buffer, size);
}
-ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size,
+ int flags)
{
- int ret;
+ struct inode *inode = d_inode(dentry);
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
+ name = xattr_full_name(handler, name);
+ return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+}
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
- return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ name = xattr_full_name(handler, name);
+ return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
}
+static const struct xattr_handler btrfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+/*
+ * Names under XATTR_BTRFS_PREFIX share the common .get callback but use
+ * btrfs_xattr_handler_set_prop for .set instead of btrfs_xattr_handler_set.
+ */
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+ .prefix = XATTR_BTRFS_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set_prop,
+};
+/*
+ * NULL-terminated table of xattr handlers: the security, trusted, user and
+ * btrfs prefix handlers defined above, plus the POSIX ACL access/default
+ * handlers when CONFIG_BTRFS_FS_POSIX_ACL is defined.
+ */
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+ &btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &btrfs_trusted_xattr_handler,
+ &btrfs_user_xattr_handler,
+ &btrfs_btrfs_xattr_handler,
+ NULL,
+};
+
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- value, size, flags);
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
- flags);
+ return generic_setxattr(dentry, name, value, size, flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- NULL, 0, XATTR_REPLACE);
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
- XATTR_REPLACE);
+ return generic_removexattr(dentry, name);
}
static int btrfs_initxattrs(struct inode *inode,
extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size);
extern int btrfs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
extern int btrfs_removexattr(struct dentry *dentry, const char *name);
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &new_mode);
if (ret < 0)
ret = acl ? -EINVAL : 0;
goto out;
}
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
ret = -EINVAL;
ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
if (acl) {
- size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
if (err)
goto out_err;
- ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+ ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
len);
err = posix_acl_to_xattr(&init_user_ns, acl,
tmp_buf, val_size1);
ceph_pagelist_append(pagelist, tmp_buf, val_size1);
}
if (default_acl) {
- size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
err = ceph_pagelist_encode_string(pagelist,
- POSIX_ACL_XATTR_DEFAULT, len);
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
*/
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.setxattr = ceph_setxattr,
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = cifs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = cifs_get_link,
.permission = cifs_permission,
/* BB add the following two eventually */
/* revalidate: cifs_revalidate,
#endif
/* Functions related to symlinks */
-extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
-extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
- int buflen);
+extern const char *cifs_get_link(struct dentry *, struct inode *,
+ struct delayed_call *);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
const char *symname);
extern int cifs_removexattr(struct dentry *, const char *);
}
const char *
-cifs_follow_link(struct dentry *direntry, void **cookie)
+cifs_get_link(struct dentry *direntry, struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(direntry);
int rc = -ENOMEM;
unsigned int xid;
char *full_path = NULL;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
+ if (!direntry)
+ return ERR_PTR(-ECHILD);
+
xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
kfree(target_path);
return ERR_PTR(rc);
}
- return *cookie = target_path;
+ set_delayed_call(done, kfree_link, target_path);
+ return target_path;
}
int
#endif /* CONFIG_CIFS_ACL */
} else {
int temp;
- temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
+ temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
if (temp == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
#else
cifs_dbg(FYI, "set POSIX ACL not supported\n");
#endif
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, ea_name, ea_value, buf_size,
cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
#else
cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
#include <linux/coda.h>
#include <linux/coda_psdev.h>
+#include <linux/pagemap.h>
#include "coda_linux.h"
static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
static const struct inode_operations coda_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = coda_setattr,
};
inode->i_fop = &coda_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &coda_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &coda_symlink_aops;
inode->i_mapping = &inode->i_data;
} else
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
- char *p = kmap(page);
+ char *p = page_address(page);
cii = ITOC(inode);
if (error)
goto fail;
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return error;
}
#include <linux/atalk.h>
#include <linux/gfp.h>
+#include "internal.h"
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_sock.h>
#include <net/bluetooth/rfcomm.h>
#include <asm/fbio.h>
#endif
-static int w_long(unsigned int fd, unsigned int cmd,
- compat_ulong_t __user *argp)
+#define convert_in_user(srcptr, dstptr) \
+({ \
+ typeof(*srcptr) val; \
+ /* Read *srcptr into val with get_user(), then store val to dstptr \
+  * with put_user(); the result is nonzero if either call is nonzero. */ \
+ get_user(val, srcptr) || put_user(val, dstptr); \
+})
+/*
+ * Run the security_file_ioctl() check for this request and, only when it
+ * returns 0, hand the request to vfs_ioctl() and return that result.
+ * A nonzero value from the security check is returned unchanged.
+ */
+static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ err = security_file_ioctl(file, cmd, arg);
+ if (err)
+ return err;
+
+ return vfs_ioctl(file, cmd, arg);
+}
+/*
+ * Issue @cmd through do_ioctl() with an unsigned long buffer obtained from
+ * compat_alloc_user_space(), then copy the value to the compat_ulong_t at
+ * @argp with convert_in_user().
+ *
+ * Returns -EFAULT if the buffer pointer is NULL or the copy fails, the
+ * do_ioctl() result if that is nonzero, and 0 otherwise.
+ */
+static int w_long(struct file *file,
+ unsigned int cmd, compat_ulong_t __user *argp)
{
- mm_segment_t old_fs = get_fs();
int err;
- unsigned long val;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
- set_fs (KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long)&val);
- set_fs (old_fs);
- if (!err && put_user(val, argp))
+ if (valp == NULL)
return -EFAULT;
- return err;
+ err = do_ioctl(file, cmd, (unsigned long)valp);
+ if (err)
+ return err;
+ if (convert_in_user(valp, argp))
+ return -EFAULT;
+ return 0;
}
struct compat_video_event {
} u;
};
-static int do_video_get_event(unsigned int fd, unsigned int cmd,
- struct compat_video_event __user *up)
+static int do_video_get_event(struct file *file,
+ unsigned int cmd, struct compat_video_event __user *up)
{
- struct video_event kevent;
- mm_segment_t old_fs = get_fs();
+ struct video_event __user *kevent =
+ compat_alloc_user_space(sizeof(*kevent));
int err;
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
- set_fs(old_fs);
+ if (kevent == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, cmd, (unsigned long)kevent);
if (!err) {
- err = put_user(kevent.type, &up->type);
- err |= put_user(kevent.timestamp, &up->timestamp);
- err |= put_user(kevent.u.size.w, &up->u.size.w);
- err |= put_user(kevent.u.size.h, &up->u.size.h);
- err |= put_user(kevent.u.size.aspect_ratio,
+ err = convert_in_user(&kevent->type, &up->type);
+ err |= convert_in_user(&kevent->timestamp, &up->timestamp);
+ err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
+ err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
+ err |= convert_in_user(&kevent->u.size.aspect_ratio,
&up->u.size.aspect_ratio);
if (err)
err = -EFAULT;
int32_t size;
};
-static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
- struct compat_video_still_picture __user *up)
+static int do_video_stillpicture(struct file *file,
+ unsigned int cmd, struct compat_video_still_picture __user *up)
{
struct video_still_picture __user *up_native;
compat_uptr_t fp;
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
compat_uptr_t palette;
};
-static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
- struct compat_video_spu_palette __user *up)
+static int do_video_set_spu_palette(struct file *file,
+ unsigned int cmd, struct compat_video_spu_palette __user *up)
{
struct video_spu_palette __user *up_native;
compat_uptr_t palp;
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
return 0;
}
-static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+static int sg_ioctl_trans(struct file *file, unsigned int cmd,
sg_io_hdr32_t __user *sgio32)
{
sg_io_hdr_t __user *sgio;
if (get_user(interface_id, &sgio32->interface_id))
return -EFAULT;
if (interface_id != 'S')
- return sys_ioctl(fd, cmd, (unsigned long)sgio32);
+ return do_ioctl(file, cmd, (unsigned long)sgio32);
if (get_user(iovec_count, &sgio32->iovec_count))
return -EFAULT;
if (put_user(compat_ptr(data), &sgio->usr_ptr))
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) sgio);
+ err = do_ioctl(file, cmd, (unsigned long) sgio);
if (err >= 0) {
void __user *datap;
int unused;
};
-static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
- compat_sg_req_info __user *o)
+static int sg_grt_trans(struct file *file,
+ unsigned int cmd, struct compat_sg_req_info __user *o)
{
int err, i;
sg_req_info_t __user *r;
r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
- err = sys_ioctl(fd,cmd,(unsigned long)r);
+ err = do_ioctl(file, cmd, (unsigned long)r);
if (err < 0)
return err;
for (i = 0; i < SG_MAX_QUEUE; i++) {
#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
-static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
- struct sock_fprog32 __user *u_fprog32)
+static int ppp_sock_fprog_ioctl_trans(struct file *file,
+ unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
{
struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
void __user *fptr64;
else
cmd = PPPIOCSACTIVE;
- return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
+ return do_ioctl(file, cmd, (unsigned long) u_fprog64);
}
struct ppp_option_data32 {
};
#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
-static int ppp_gidle(unsigned int fd, unsigned int cmd,
+static int ppp_gidle(struct file *file, unsigned int cmd,
struct ppp_idle32 __user *idle32)
{
struct ppp_idle __user *idle;
idle = compat_alloc_user_space(sizeof(*idle));
- err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
+ err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
if (!err) {
if (get_user(xmit, &idle->xmit_idle) ||
return err;
}
-static int ppp_scompress(unsigned int fd, unsigned int cmd,
+static int ppp_scompress(struct file *file, unsigned int cmd,
struct ppp_option_data32 __user *odata32)
{
struct ppp_option_data __user *odata;
sizeof(__u32) + sizeof(int)))
return -EFAULT;
- return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
+ return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
}
#ifdef CONFIG_BLOCK
};
#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
-static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
+static int mt_ioctl_trans(struct file *file,
+ unsigned int cmd, void __user *argp)
{
- mm_segment_t old_fs = get_fs();
- struct mtget get;
+ /* NULL initialization to make gcc shut up */
+ struct mtget __user *get = NULL;
struct mtget32 __user *umget32;
- struct mtpos pos;
+ struct mtpos __user *pos = NULL;
struct mtpos32 __user *upos32;
unsigned long kcmd;
void *karg;
switch(cmd) {
case MTIOCPOS32:
kcmd = MTIOCPOS;
- karg = &pos;
+ pos = compat_alloc_user_space(sizeof(*pos));
+ karg = pos;
break;
default: /* MTIOCGET32 */
kcmd = MTIOCGET;
- karg = &get;
+ get = compat_alloc_user_space(sizeof(*get));
+ karg = get;
break;
}
- set_fs (KERNEL_DS);
- err = sys_ioctl (fd, kcmd, (unsigned long)karg);
- set_fs (old_fs);
+ if (karg == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, kcmd, (unsigned long)karg);
if (err)
return err;
switch (cmd) {
case MTIOCPOS32:
upos32 = argp;
- err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
+ err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
break;
case MTIOCGET32:
umget32 = argp;
- err = __put_user(get.mt_type, &umget32->mt_type);
- err |= __put_user(get.mt_resid, &umget32->mt_resid);
- err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
- err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
- err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
- err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
- err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
+ err = convert_in_user(&get->mt_type, &umget32->mt_type);
+ err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
+ err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
+ err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
+ err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
+ err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
+ err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
break;
}
return err ? -EFAULT: 0;
compat_int_t reserved[1];
};
-static int serial_struct_ioctl(unsigned fd, unsigned cmd,
- struct serial_struct32 __user *ss32)
+static int serial_struct_ioctl(struct file *file,
+ unsigned cmd, struct serial_struct32 __user *ss32)
{
typedef struct serial_struct32 SS32;
int err;
- struct serial_struct ss;
- mm_segment_t oldseg = get_fs();
+ struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
__u32 udata;
unsigned int base;
+ unsigned char *iomem_base;
+ if (ss == NULL)
+ return -EFAULT;
if (cmd == TIOCSSERIAL) {
- if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
+ if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
+ get_user(udata, &ss32->iomem_base))
return -EFAULT;
- if (__get_user(udata, &ss32->iomem_base))
+ iomem_base = compat_ptr(udata);
+ if (put_user(iomem_base, &ss->iomem_base) ||
+ convert_in_user(&ss32->iomem_reg_shift,
+ &ss->iomem_reg_shift) ||
+ convert_in_user(&ss32->port_high, &ss->port_high) ||
+ put_user(0UL, &ss->iomap_base))
return -EFAULT;
- ss.iomem_base = compat_ptr(udata);
- if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __get_user(ss.port_high, &ss32->port_high))
- return -EFAULT;
- ss.iomap_base = 0UL;
}
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd,cmd,(unsigned long)(&ss));
- set_fs(oldseg);
+ err = do_ioctl(file, cmd, (unsigned long)ss);
if (cmd == TIOCGSERIAL && err >= 0) {
- if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
+ if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
+ get_user(iomem_base, &ss->iomem_base))
return -EFAULT;
- base = (unsigned long)ss.iomem_base >> 32 ?
- 0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
- if (__put_user(base, &ss32->iomem_base) ||
- __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __put_user(ss.port_high, &ss32->port_high))
+ base = (unsigned long)iomem_base >> 32 ?
+ 0xffffffff : (unsigned)(unsigned long)iomem_base;
+ if (put_user(base, &ss32->iomem_base) ||
+ convert_in_user(&ss->iomem_reg_shift,
+ &ss32->iomem_reg_shift) ||
+ convert_in_user(&ss->port_high, &ss32->port_high))
return -EFAULT;
}
return err;
struct i2c_msg msgs[0];
};
-static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_rdwr_ioctl_data32 __user *udata)
+static int do_i2c_rdwr_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
{
struct i2c_rdwr_aligned __user *tdata;
struct i2c_msg __user *tmsgs;
put_user(compat_ptr(datap), &tmsgs[i].buf))
return -EFAULT;
}
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
-static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_smbus_ioctl_data32 __user *udata)
+static int do_i2c_smbus_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
{
struct i2c_smbus_ioctl_data __user *tdata;
compat_caddr_t datap;
__put_user(compat_ptr(datap), &tdata->data))
return -EFAULT;
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
-static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
+static int rtc_ioctl(struct file *file,
+ unsigned cmd, void __user *argp)
{
- mm_segment_t oldfs = get_fs();
- compat_ulong_t val32;
- unsigned long kval;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
int ret;
+ if (valp == NULL)
+ return -EFAULT;
switch (cmd) {
case RTC_IRQP_READ32:
case RTC_EPOCH_READ32:
- set_fs(KERNEL_DS);
- ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+ ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
RTC_IRQP_READ : RTC_EPOCH_READ,
- (unsigned long)&kval);
- set_fs(oldfs);
+ (unsigned long)valp);
if (ret)
return ret;
- val32 = kval;
- return put_user(val32, (unsigned int __user *)argp);
+ return convert_in_user(valp, (unsigned int __user *)argp);
case RTC_IRQP_SET32:
- return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
case RTC_EPOCH_SET32:
- return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
}
return -ENOIOCTLCMD;
* a compat_ioctl operation in the place that handles the
* ioctl for the native case.
*/
-static long do_ioctl_trans(int fd, unsigned int cmd,
+static long do_ioctl_trans(unsigned int cmd,
unsigned long arg, struct file *file)
{
void __user *argp = compat_ptr(arg);
switch (cmd) {
case PPPIOCGIDLE32:
- return ppp_gidle(fd, cmd, argp);
+ return ppp_gidle(file, cmd, argp);
case PPPIOCSCOMPRESS32:
- return ppp_scompress(fd, cmd, argp);
+ return ppp_scompress(file, cmd, argp);
case PPPIOCSPASS32:
case PPPIOCSACTIVE32:
- return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+ return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
#ifdef CONFIG_BLOCK
case SG_IO:
- return sg_ioctl_trans(fd, cmd, argp);
+ return sg_ioctl_trans(file, cmd, argp);
case SG_GET_REQUEST_TABLE:
- return sg_grt_trans(fd, cmd, argp);
+ return sg_grt_trans(file, cmd, argp);
case MTIOCGET32:
case MTIOCPOS32:
- return mt_ioctl_trans(fd, cmd, argp);
+ return mt_ioctl_trans(file, cmd, argp);
#endif
/* Serial */
case TIOCGSERIAL:
case TIOCSSERIAL:
- return serial_struct_ioctl(fd, cmd, argp);
+ return serial_struct_ioctl(file, cmd, argp);
/* i2c */
case I2C_FUNCS:
- return w_long(fd, cmd, argp);
+ return w_long(file, cmd, argp);
case I2C_RDWR:
- return do_i2c_rdwr_ioctl(fd, cmd, argp);
+ return do_i2c_rdwr_ioctl(file, cmd, argp);
case I2C_SMBUS:
- return do_i2c_smbus_ioctl(fd, cmd, argp);
+ return do_i2c_smbus_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
- return rtc_ioctl(fd, cmd, argp);
+ return rtc_ioctl(file, cmd, argp);
/* dvb */
case VIDEO_GET_EVENT:
- return do_video_get_event(fd, cmd, argp);
+ return do_video_get_event(file, cmd, argp);
case VIDEO_STILLPICTURE:
- return do_video_stillpicture(fd, cmd, argp);
+ return do_video_stillpicture(file, cmd, argp);
case VIDEO_SET_SPU_PALETTE:
- return do_video_set_spu_palette(fd, cmd, argp);
+ return do_video_set_spu_palette(file, cmd, argp);
}
/*
case NBD_SET_BLKSIZE:
case NBD_SET_SIZE:
case NBD_SET_SIZE_BLOCKS:
- return do_vfs_ioctl(file, fd, cmd, arg);
+ return vfs_ioctl(file, cmd, arg);
}
return -ENOIOCTLCMD;
if (compat_ioctl_check_table(XFORM(cmd)))
goto found_handler;
- error = do_ioctl_trans(fd, cmd, arg, f.file);
+ error = do_ioctl_trans(cmd, arg, f.file);
if (error == -ENOIOCTLCMD)
error = -ENOTTY;
}
-static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *configfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unsigned long page = get_zeroed_page(GFP_KERNEL);
+ char *body;
int error;
- if (!page)
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = configfs_getlink(dentry, (char *)page);
+ error = configfs_getlink(dentry, body);
if (!error) {
- return *cookie = (void *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
- free_page(page);
+ kfree(body);
return ERR_PTR(error);
}
const struct inode_operations configfs_symlink_inode_operations = {
- .follow_link = configfs_follow_link,
+ .get_link = configfs_get_link,
.readlink = generic_readlink,
- .put_link = free_page_put_link,
.setattr = configfs_setattr,
};
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
default:
}
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
- if (unlikely(inode->i_op->follow_link)) {
+ if (unlikely(inode->i_op->get_link)) {
add_flags = DCACHE_SYMLINK_TYPE;
goto type_determined;
}
return rc ? ERR_PTR(rc) : buf;
}
-static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *ecryptfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
size_t len;
- char *buf = ecryptfs_readlink_lower(dentry, &len);
+ char *buf;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ buf = ecryptfs_readlink_lower(dentry, &len);
if (IS_ERR(buf))
return buf;
fsstack_copy_attr_atime(d_inode(dentry),
d_inode(ecryptfs_dentry_to_lower(dentry)));
buf[len] = '\0';
- return *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
+ return buf;
}
/**
const struct inode_operations ecryptfs_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = ecryptfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr_link,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &efs_symlink_aops;
break;
case S_IFCHR:
static int efs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct buffer_head * bh;
struct inode * inode = page->mapping->host;
efs_block_t size = inode->i_size;
}
link[size] = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
inode->i_link = (char *)oi->i_data;
} else {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
}
} else {
if (l > sizeof(oi->i_data)) {
/* slow symlink */
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
memset(oi->i_data, 0, sizeof(oi->i_data));
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
if (l > sizeof (EXT2_I(inode)->i_data)) {
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
const struct inode_operations ext2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
const struct inode_operations ext2_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
const struct xattr_handler *handler =
ext2_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(handler, dentry, buffer,
- rest, entry->e_name,
- entry->e_name_len);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest) {
error = -ERANGE;
goto cleanup;
}
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
#include <linux/security.h>
#include "xattr.h"
-static size_t
-ext2_xattr_security_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
ext2_xattr_security_get(const struct xattr_handler *handler,
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
buffer, size);
}
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext2_xattr_security_list,
.get = ext2_xattr_security_get,
.set = ext2_xattr_security_set,
};
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_trusted_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool
+ext2_xattr_trusted_list(struct dentry *dentry)
{
- const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
buffer, size);
}
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_user_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool
+ext2_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
+ inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
if (!encryption_required)
inode->i_op = &ext4_symlink_inode_operations;
+ inode_nohighmem(inode);
ext4_set_aops(inode);
/*
* We cannot call page_symlink() with transaction started
#include "xattr.h"
#ifdef CONFIG_EXT4_FS_ENCRYPTION
-static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *ext4_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
struct ext4_str cstr, pstr;
- struct inode *inode = d_inode(dentry);
struct ext4_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
int res;
u32 plen, max_size = inode->i_sb->s_blocksize;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = ext4_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
}
/* Null-terminate the name */
if (res <= plen)
paddr[res] = '\0';
- if (cpage) {
- kunmap(cpage);
+ if (cpage)
page_cache_release(cpage);
- }
- return *cookie = paddr;
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
- if (cpage) {
- kunmap(cpage);
+ if (cpage)
page_cache_release(cpage);
- }
kfree(paddr);
return ERR_PTR(res);
}
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext4_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
const struct xattr_handler *handler =
ext4_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(handler, dentry, buffer,
- rest, entry->e_name,
- entry->e_name_len);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest)
return -ERANGE;
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
}
- return buffer_size - rest;
+ return buffer_size - rest; /* total size */
}
static int
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_security_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
-{
- const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
- const size_t total_len = prefix_len + name_len + 1;
-
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
ext4_xattr_security_get(const struct xattr_handler *handler,
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
.set = ext4_xattr_security_set,
};
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_trusted_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool
+ext4_xattr_trusted_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
struct dentry *dentry, const char *name, void *buffer,
size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_user_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool
+ext4_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
return err;
}
-static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- const char *link = page_follow_link_light(dentry, cookie);
+ const char *link = page_get_link(dentry, inode, done);
if (!IS_ERR(link) && !*link) {
/* this is broken symlink case */
- page_put_link(NULL, *cookie);
+ do_delayed_call(done);
+ clear_delayed_call(done);
link = ERR_PTR(-ENOENT);
}
return link;
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
f2fs_lock_op(sbi);
}
#ifdef CONFIG_F2FS_FS_ENCRYPTION
-static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
struct f2fs_str cstr;
struct f2fs_str pstr = FSTR_INIT(NULL, 0);
- struct inode *inode = d_inode(dentry);
struct f2fs_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
u32 max_size = inode->i_sb->s_blocksize;
int res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = f2fs_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
/* Symlink is encrypted */
/* Null-terminate the name */
paddr[res] = '\0';
- kunmap(cpage);
page_cache_release(cpage);
- return *cookie = paddr;
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
kfree(cstr.name);
f2fs_fname_crypto_free_buffer(&pstr);
- kunmap(cpage);
page_cache_release(cpage);
return ERR_PTR(res);
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.setxattr = generic_setxattr,
const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_follow_link,
- .put_link = page_put_link,
+ .get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
#include "f2fs.h"
#include "xattr.h"
-static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t len)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- int total_len, prefix_len;
-
- switch (handler->flags) {
- case F2FS_XATTR_INDEX_USER:
- if (!test_opt(sbi, XATTR_USER))
- return -EOPNOTSUPP;
- break;
- case F2FS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- break;
- case F2FS_XATTR_INDEX_SECURITY:
- break;
- default:
- return -EINVAL;
- }
-
- prefix_len = strlen(handler->prefix);
- total_len = prefix_len + len + 1;
- if (list && total_len <= list_size) {
- memcpy(list, handler->prefix, prefix_len);
- memcpy(list + prefix_len, name, len);
- list[prefix_len + len] = '\0';
- }
- return total_len;
-}
-
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
struct dentry *dentry, const char *name, void *buffer,
size_t size)
default:
return -EINVAL;
}
- if (strcmp(name, "") == 0)
- return -EINVAL;
return f2fs_getxattr(d_inode(dentry), handler->flags, name,
buffer, size, NULL);
}
default:
return -EINVAL;
}
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return f2fs_setxattr(d_inode(dentry), handler->flags, name,
value, size, NULL, flags);
}
-static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t len)
+static bool f2fs_xattr_user_list(struct dentry *dentry)
{
- const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
- size_t size;
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return test_opt(sbi, XATTR_USER);
+}
+
+static bool f2fs_xattr_trusted_list(struct dentry *dentry)
+{
+ return capable(CAP_SYS_ADMIN);
}
static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
if (buffer)
*((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_user_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
const struct xattr_handler f2fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.flags = F2FS_XATTR_INDEX_TRUSTED,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_trusted_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
const struct xattr_handler f2fs_xattr_advise_handler = {
- .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+ .name = F2FS_SYSTEM_ADVISE_NAME,
.flags = F2FS_XATTR_INDEX_ADVISE,
- .list = f2fs_xattr_advise_list,
.get = f2fs_xattr_advise_get,
.set = f2fs_xattr_advise_set,
};
const struct xattr_handler f2fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = F2FS_XATTR_INDEX_SECURITY,
- .list = f2fs_xattr_generic_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
+ const char *prefix;
+ size_t prefix_len;
size_t size;
- if (!handler)
+ if (!handler || (handler->list && !handler->list(dentry)))
continue;
- size = handler->list(handler, dentry, buffer, rest,
- entry->e_name, entry->e_name_len);
- if (buffer && size > rest) {
- error = -ERANGE;
- goto cleanup;
+ prefix = handler->prefix ?: handler->name;
+ prefix_len = strlen(prefix);
+ size = prefix_len + entry->e_name_len + 1;
+ if (buffer) {
+ if (size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
-
- if (buffer)
- buffer += size;
rest -= size;
}
error = buffer_size - rest;
#define F2FS_XATTR_REFCOUNT_MAX 1024
/* Name indexes */
-#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
+#define F2FS_SYSTEM_ADVISE_NAME "system.advise"
#define F2FS_XATTR_INDEX_USER 1
#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
} else if (S_ISLNK(ip->i_mode)) {
if (!VXFS_ISIMMED(vip)) {
ip->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &vxfs_aops;
} else {
ip->i_op = &simple_symlink_inode_operations;
return err;
}
-static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
+static const char *fuse_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
char *link;
ssize_t ret;
- link = (char *) __get_free_page(GFP_KERNEL);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ link = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!link)
return ERR_PTR(-ENOMEM);
args.out.args[0].value = link;
ret = fuse_simple_request(fc, &args);
if (ret < 0) {
- free_page((unsigned long) link);
+ kfree(link);
link = ERR_PTR(ret);
} else {
link[ret] = '\0';
- *cookie = link;
+ set_delayed_call(done, kfree_link, link);
}
fuse_invalidate_atime(inode);
return link;
static const struct inode_operations fuse_symlink_inode_operations = {
.setattr = fuse_setattr,
- .follow_link = fuse_follow_link,
- .put_link = free_page_put_link,
+ .get_link = fuse_get_link,
.readlink = generic_readlink,
.getattr = fuse_getattr,
.setxattr = fuse_setxattr,
{
switch (type) {
case ACL_TYPE_ACCESS:
- return GFS2_POSIX_ACL_ACCESS;
+ return XATTR_POSIX_ACL_ACCESS;
case ACL_TYPE_DEFAULT:
- return GFS2_POSIX_ACL_DEFAULT;
+ return XATTR_POSIX_ACL_DEFAULT;
}
return NULL;
}
#include "incore.h"
-#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
}
/**
- * gfs2_follow_link - Follow a symbolic link
+ * gfs2_get_link - Follow a symbolic link
* @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
+ * @inode: The inode of the link
+ * @done: destructor for return value
*
* This can handle symlinks of any size.
*
* Returns: pointer to the link body buffer on success (released through @done), or an ERR_PTR()-encoded error code
*/
-static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
+static const char *gfs2_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
struct buffer_head *dibh;
unsigned int size;
char *buf;
int error;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
error = gfs2_glock_nq(&i_gh);
if (error) {
out:
gfs2_glock_dq_uninit(&i_gh);
if (!IS_ERR(buf))
- *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
return buf;
}
const struct inode_operations gfs2_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = gfs2_follow_link,
- .put_link = kfree_put_link,
+ .get_link = gfs2_get_link,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
size, flags, handler->flags);
}
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
- struct gfs2_ea_header *ea, char *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int amount = GFS2_EA_DATA_LEN(ea);
- unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
- int ret;
-
- ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
- if (ret)
- return ret;
-
- ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
- gfs2_trans_end(sdp);
-
- return ret;
-}
-
-int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-{
- struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_ea_location el;
- int error;
-
- error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
- if (error)
- return error;
-
- if (GFS2_EA_IS_STUFFED(el.el_ea)) {
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
- if (error == 0) {
- gfs2_trans_add_meta(ip->i_gl, el.el_bh);
- memcpy(GFS2_EA2DATA(el.el_ea), data,
- GFS2_EA_DATA_LEN(el.el_ea));
- }
- } else {
- error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
- }
-
- brelse(el.el_bh);
- if (error)
- return error;
-
- error = gfs2_setattr_simple(inode, attr);
- gfs2_trans_end(sdp);
- return error;
-}
-
static int ea_dealloc_indirect(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
/* Exported to acl.c */
extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
} else if (S_ISLNK(inode->i_mode)) {
sbi->file_count++;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
hip->clump_blocks = 1;
} else
inode->i_mapping->a_ops = &hfsplus_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
} else {
init_special_inode(inode, inode->i_mode,
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
err = posix_acl_equiv_mode(acl, &inode->i_mode);
if (err < 0)
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
char *xattr_name;
int res;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
int res;
char *xattr_name;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow retrieving properly prefixed attributes
* by prepending them with "osx."
struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow setting properly prefixed attributes
* by prepending them with "osx."
.setattr = hostfs_setattr,
};
-static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *hostfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- char *link = __getname();
+ char *link;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ link = kmalloc(PATH_MAX, GFP_KERNEL);
if (link) {
char *path = dentry_name(dentry);
int err = -ENOMEM;
__putname(path);
}
if (err < 0) {
- __putname(link);
+ kfree(link);
return ERR_PTR(err);
}
} else {
return ERR_PTR(-ENOMEM);
}
- return *cookie = link;
-}
-
-static void hostfs_put_link(struct inode *unused, void *cookie)
-{
- __putname(cookie);
+ set_delayed_call(done, kfree_link, link);
+ return link;
}
static const struct inode_operations hostfs_link_iops = {
.readlink = generic_readlink,
- .follow_link = hostfs_follow_link,
- .put_link = hostfs_put_link,
+ .get_link = hostfs_get_link,
};
static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
kfree(ea);
i->i_mode = S_IFLNK | 0777;
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &hpfs_symlink_aops;
set_nlink(i, 1);
i->i_size = ea_size;
result->i_blocks = 1;
set_nlink(result, 1);
result->i_size = strlen(symlink);
+ inode_nohighmem(result);
result->i_op = &page_symlink_inode_operations;
result->i_data.a_ops = &hpfs_symlink_aops;
static int hpfs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct inode *i = page->mapping->host;
struct fnode *fnode;
struct buffer_head *bh;
goto fail;
hpfs_unlock(i->i_sb);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
hpfs_unlock(i->i_sb);
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
lockdep_annotate_inode_mutex_key(inode);
new_flags) != old_flags));
}
EXPORT_SYMBOL(inode_set_flags);
+
+/*
+ * inode_nohighmem - set the allocation mask of @inode's mapping to GFP_USER
+ * @inode: inode whose ->i_mapping gfp mask is replaced
+ *
+ * NOTE(review): going by the name and by the BUG_ON() on __GFP_HIGHMEM in
+ * page_get_link(), the purpose is to keep symlink page-cache pages out of
+ * highmem so that page_address() can be used on them - confirm.
+ */
+void inode_nohighmem(struct inode *inode)
+{
+ mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
+}
+EXPORT_SYMBOL(inode_nohighmem);
* fs/nsfs.c
*/
extern struct dentry_operations ns_dentry_operations;
+
+/*
+ * fs/ioctl.c
+ */
+extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
+ unsigned long arg);
+extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
+#include "internal.h"
#include <asm/ioctls.h>
*
* Returns 0 on success, -errno on error.
*/
-static long vfs_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
+long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
inode->i_fop = &isofs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
} else
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
struct inode *inode = page->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- char *link = kmap(page);
+ char *link = page_address(page);
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
struct buffer_head *bh;
char *rpnt = link;
brelse(bh);
*rpnt = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
brelse(bh);
error:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
-static size_t jffs2_security_listxattr(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len)
-{
- size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_SECURITY_PREFIX);
- strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = jffs2_security_listxattr,
.set = jffs2_security_setxattr,
.get = jffs2_security_getxattr
};
const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
struct jffs2_xattr_ref *ref, **pref;
struct jffs2_xattr_datum *xd;
const struct xattr_handler *xhandle;
- ssize_t len, rc;
+ const char *prefix;
+ ssize_t prefix_len, len, rc;
int retry = 0;
rc = check_xattr_ref_inode(c, ic);
}
}
xhandle = xprefix_to_handler(xd->xprefix);
- if (!xhandle)
+ if (!xhandle || (xhandle->list && !xhandle->list(dentry)))
continue;
+ prefix = xhandle->prefix ?: xhandle->name;
+ prefix_len = strlen(prefix);
+ rc = prefix_len + xd->name_len + 1;
+
if (buffer) {
- rc = xhandle->list(xhandle, dentry, buffer + len,
- size - len, xd->xname,
- xd->name_len);
- } else {
- rc = xhandle->list(xhandle, dentry, NULL, 0,
- xd->xname, xd->name_len);
+ if (rc > size - len) {
+ rc = -ERANGE;
+ goto out;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, xd->xname, xd->name_len);
+ buffer += xd->name_len;
+ *buffer++ = 0;
}
- if (rc < 0)
- goto out;
len += rc;
}
rc = len;
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
-static size_t jffs2_trusted_listxattr(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len)
+static bool jffs2_trusted_listxattr(struct dentry *dentry)
{
- size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && retlen<=list_size) {
- strcpy(list, XATTR_TRUSTED_PREFIX);
- strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
- }
-
- return retlen;
+ return capable(CAP_SYS_ADMIN);
}
const struct xattr_handler jffs2_trusted_xattr_handler = {
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
-static size_t jffs2_user_listxattr(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len)
-{
- size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_USER_PREFIX);
- strcpy(list + XATTR_USER_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = jffs2_user_listxattr,
.set = jffs2_user_setxattr,
.get = jffs2_user_getxattr
};
switch(type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
rc = posix_acl_equiv_mode(acl, &inode->i_mode);
if (rc < 0)
}
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
} else if (S_ISLNK(inode->i_mode)) {
if (inode->i_size >= IDATASIZE) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &jfs_aops;
} else {
inode->i_op = &jfs_fast_symlink_inode_operations;
jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
ip->i_op = &jfs_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &jfs_aops;
/*
const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
const struct inode_operations jfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
if (!attrs)
return -ENOMEM;
- return simple_xattr_remove(&attrs->xattrs, name);
+ return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
}
ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
if (!attrs)
return -ENOMEM;
- return simple_xattr_list(&attrs->xattrs, buf, size);
+ return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
return error;
}
-static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
+/*
+ * kernfs_iop_get_link - ->get_link() implementation for kernfs symlinks
+ * @dentry: dentry of the symlink, or NULL (then -ECHILD is returned)
+ * @inode: inode of the symlink (not used by this function)
+ * @done: delayed call that is set up to kfree the returned buffer
+ *
+ * Allocates a zeroed PAGE_SIZE buffer, lets kernfs_getlink() fill it and
+ * returns it.  Returns ERR_PTR(-ECHILD) for a NULL dentry,
+ * ERR_PTR(-ENOMEM) if the allocation fails, or the error reported by
+ * kernfs_getlink() wrapped in ERR_PTR().
+ */
+static const char *kernfs_iop_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- int error = -ENOMEM;
- unsigned long page = get_zeroed_page(GFP_KERNEL);
- if (!page)
+ char *body;
+ int error;
+
+ /* kernfs_getlink() needs the dentry; without one ask the caller to retry */
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = kernfs_getlink(dentry, (char *)page);
+ error = kernfs_getlink(dentry, body);
if (unlikely(error < 0)) {
- free_page((unsigned long)page);
+ kfree(body);
return ERR_PTR(error);
}
- return *cookie = (char *)page;
+ /* from here on the buffer is released by the caller via @done */
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
const struct inode_operations kernfs_symlink_iops = {
.getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
.readlink = generic_readlink,
- .follow_link = kernfs_iop_follow_link,
- .put_link = free_page_put_link,
+ .get_link = kernfs_iop_get_link,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.permission = kernfs_iop_permission,
}
EXPORT_SYMBOL(noop_fsync);
-void kfree_put_link(struct inode *unused, void *cookie)
+/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
+void kfree_link(void *p)
{
- kfree(cookie);
+ kfree(p);
}
-EXPORT_SYMBOL(kfree_put_link);
-
-void free_page_put_link(struct inode *unused, void *cookie)
-{
- free_page((unsigned long) cookie);
-}
-EXPORT_SYMBOL(free_page_put_link);
+EXPORT_SYMBOL(kfree_link);
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
}
EXPORT_SYMBOL(simple_nosetlease);
-const char *simple_follow_link(struct dentry *dentry, void **cookie)
+const char *simple_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
{
- return d_inode(dentry)->i_link;
+ return inode->i_link;
}
-EXPORT_SYMBOL(simple_follow_link);
+EXPORT_SYMBOL(simple_get_link);
const struct inode_operations simple_symlink_inode_operations = {
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.readlink = generic_readlink
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
return __logfs_create(dir, dentry, inode, target, destlen);
return -EIO;
}
-const struct inode_operations logfs_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
-
const struct inode_operations logfs_dir_iops = {
.create = logfs_create,
.link = logfs_link,
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFLNK:
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFSOCK: /* fall through */
#endif
/* dir.c */
-extern const struct inode_operations logfs_symlink_iops;
extern const struct inode_operations logfs_dir_iops;
extern const struct file_operations logfs_dir_fops;
int logfs_replay_journal(struct super_block *sb);
static const struct inode_operations minix_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = minix_getattr,
};
inode->i_mapping->a_ops = &minix_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &minix_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
int total_link_count;
struct saved {
struct path link;
- void *cookie;
+ struct delayed_call done;
const char *name;
- struct inode *inode;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
+ struct inode *link_inode;
unsigned root_seq;
int dfd;
};
int i = nd->depth;
while (i--) {
struct saved *last = nd->stack + i;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link) {
- inode->i_op->put_link(inode, last->cookie);
- last->cookie = NULL;
- }
+ do_delayed_call(&last->done);
+ clear_delayed_call(&last->done);
}
}
}
/*
- * Helper to directly jump to a known parsed path from ->follow_link,
+ * Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
void nd_jump_link(struct path *path)
static inline void put_link(struct nameidata *nd)
{
struct saved *last = nd->stack + --nd->depth;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link)
- inode->i_op->put_link(inode, last->cookie);
+ do_delayed_call(&last->done);
if (!(nd->flags & LOOKUP_RCU))
path_put(&last->link);
}
return 0;
/* Allowed if owner and follower match. */
- inode = nd->stack[0].inode;
+ inode = nd->link_inode;
if (uid_eq(current_cred()->fsuid, inode->i_uid))
return 0;
{
struct saved *last = nd->stack + nd->depth - 1;
struct dentry *dentry = last->link.dentry;
- struct inode *inode = last->inode;
+ struct inode *inode = nd->link_inode;
int error;
const char *res;
nd->last_type = LAST_BIND;
res = inode->i_link;
if (!res) {
+ const char * (*get)(struct dentry *, struct inode *,
+ struct delayed_call *);
+ get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
- if (unlikely(unlazy_walk(nd, NULL, 0)))
- return ERR_PTR(-ECHILD);
+ res = get(NULL, inode, &last->done);
+ if (res == ERR_PTR(-ECHILD)) {
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
+ return ERR_PTR(-ECHILD);
+ res = get(dentry, inode, &last->done);
+ }
+ } else {
+ res = get(dentry, inode, &last->done);
}
- res = inode->i_op->follow_link(dentry, &last->cookie);
- if (IS_ERR_OR_NULL(res)) {
- last->cookie = NULL;
+ if (IS_ERR_OR_NULL(res))
return res;
- }
}
if (*res == '/') {
if (nd->flags & LOOKUP_RCU) {
last = nd->stack + nd->depth++;
last->link = *link;
- last->cookie = NULL;
- last->inode = inode;
+ clear_delayed_call(&last->done);
+ nd->link_inode = inode;
last->seq = seq;
return 1;
}
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
- * have ->follow_link() touching nd only in nd_set_link(). Using (or not
- * using) it for any given inode is up to filesystem.
+ * have ->get_link() not calling nd_jump_link(). Using (or not using) it
+ * for any given inode is up to filesystem.
*/
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- void *cookie;
+ DEFINE_DELAYED_CALL(done);
struct inode *inode = d_inode(dentry);
const char *link = inode->i_link;
int res;
if (!link) {
- link = inode->i_op->follow_link(dentry, &cookie);
+ link = inode->i_op->get_link(dentry, inode, &done);
if (IS_ERR(link))
return PTR_ERR(link);
}
res = readlink_copy(buffer, buflen, link);
- if (inode->i_op->put_link)
- inode->i_op->put_link(inode, cookie);
+ do_delayed_call(&done);
return res;
}
EXPORT_SYMBOL(generic_readlink);
/* get the link contents into pagecache */
-static char *page_getlink(struct dentry * dentry, struct page **ppage)
+/*
+ * page_get_link - ->get_link() for symlinks whose body lives in page 0 of
+ * the inode's page cache
+ * @dentry: dentry of the symlink, or NULL
+ * @inode: inode of the symlink
+ * @callback: delayed call set up to drop the page reference (page_put_link)
+ *
+ * With a NULL @dentry only an already cached, uptodate page is used;
+ * otherwise ERR_PTR(-ECHILD) is returned.  With a non-NULL @dentry the page
+ * is obtained with read_mapping_page() and its error, if any, is returned.
+ * On success the link body is NUL-terminated in place and its kernel
+ * address is returned.
+ */
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
char *kaddr;
struct page *page;
- struct address_space *mapping = dentry->d_inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- return (char*)page;
- *ppage = page;
- kaddr = kmap(page);
- nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+ struct address_space *mapping = inode->i_mapping;
+
+ /* NULL dentry: look the page up only, never start a read */
+ if (!dentry) {
+ page = find_get_page(mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ page = read_mapping_page(mapping, 0, NULL);
+ if (IS_ERR(page))
+ return (char*)page;
+ }
+ /* the page reference is dropped when the caller runs @callback */
+ set_delayed_call(callback, page_put_link, page);
+ /* page_address() is used without kmap(), so the mapping must not allow highmem pages */
+ BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+ kaddr = page_address(page);
+ nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
-int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
- struct page *page = NULL;
- int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
- return res;
-}
-EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_get_link);
-const char *page_follow_link_light(struct dentry *dentry, void **cookie)
+void page_put_link(void *arg)
{
- struct page *page = NULL;
- char *res = page_getlink(dentry, &page);
- if (!IS_ERR(res))
- *cookie = page;
- return res;
+ put_page(arg);
}
-EXPORT_SYMBOL(page_follow_link_light);
+EXPORT_SYMBOL(page_put_link);
-void page_put_link(struct inode *unused, void *cookie)
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- struct page *page = cookie;
- kunmap(page);
- page_cache_release(page);
+ DEFINE_DELAYED_CALL(done);
+ int res = readlink_copy(buffer, buflen,
+ page_get_link(dentry, d_inode(dentry),
+ &done));
+ do_delayed_call(&done);
+ return res;
}
-EXPORT_SYMBOL(page_put_link);
+EXPORT_SYMBOL(page_readlink);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
struct page *page;
void *fsdata;
int err;
- char *kaddr;
unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
if (nofs)
flags |= AOP_FLAG_NOFS;
if (err)
goto fail;
- kaddr = kmap_atomic(page);
- memcpy(kaddr, symname, len-1);
- kunmap_atomic(kaddr);
+ memcpy(page_address(page), symname, len-1);
err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
const struct inode_operations page_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
static const struct inode_operations ncp_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ncp_notify_change,
};
#endif
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &ncp_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &ncp_symlink_aops;
#endif
} else {
inode->i_fop = NULL;
inode->i_flags |= S_AUTOMOUNT;
}
- } else if (S_ISLNK(inode->i_mode))
+ } else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nfs_symlink_inode_operations;
- else
+ inode_nohighmem(inode);
+ } else
init_special_inode(inode, inode->i_mode, fattr->rdev);
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
|| NFS_STALE(inode);
}
+/*
+ * nfs_revalidate_mapping_rcu - check, without revalidating, whether the
+ * cached mapping of @inode can be used as it is
+ * @inode: inode to check
+ *
+ * This function only tests state; it never triggers a revalidation itself.
+ * Returns 0 if @inode is a swapfile or if nothing indicates stale data,
+ * and -ECHILD if the inode needs revalidation, an invalidation is in
+ * progress (NFS_INO_INVALIDATING) or the cached data is marked invalid
+ * (NFS_INO_INVALID_DATA).
+ */
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ /* the flag bit and cache_validity are sampled under inode->i_lock */
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
/**
* __nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
int error;
error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
- POSIX_ACL_XATTR_ACCESS, data, size, &result);
+ XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result);
if (error)
return error;
error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
- POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+ XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result);
if (error)
return error;
return result;
const void *buf, size_t buflen,
int flags)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
}
struct dentry *dentry, const char *key,
void *buf, size_t buflen)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
}
-static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len)
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
{
- size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
-
- if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
- return 0;
-
- if (list && len <= list_len)
- memcpy(list, XATTR_NAME_NFSV4_ACL, len);
- return len;
+ return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
}
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
-static inline int nfs4_server_supports_labels(struct nfs_server *server)
-{
- return server->caps & NFS_CAP_SECURITY_LABEL;
-}
static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
struct dentry *dentry, const char *key,
return -EOPNOTSUPP;
}
-static size_t nfs4_xattr_list_nfs4_label(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len)
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
{
- size_t len = 0;
+ int len = 0;
- if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
- len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
- if (list && len <= list_len)
- security_inode_listsecurity(d_inode(dentry), list, len);
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(inode, list, list_len);
+ if (list_len && len > list_len)
+ return -ERANGE;
}
return len;
}
static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = nfs4_xattr_list_nfs4_label,
.get = nfs4_xattr_get_nfs4_label,
.set = nfs4_xattr_set_nfs4_label,
};
-#endif
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+
+#endif
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
#endif
};
+/*
+ * nfs4_listxattr - ->listxattr() for NFSv4 inodes
+ * @dentry: dentry whose extended attribute names are listed
+ * @list: output buffer, or NULL
+ * @size: size of @list
+ *
+ * Concatenates the result of generic_listxattr() with the result of
+ * nfs4_listxattr_nfs4_label().  Returns the sum of both lengths, or the
+ * first negative error either of them reports.
+ */
+ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ ssize_t error, error2;
+
+ error = generic_listxattr(dentry, list, size);
+ if (error < 0)
+ return error;
+ /* skip past what generic_listxattr() reported; with a NULL list nothing is adjusted */
+ if (list) {
+ list += error;
+ size -= error;
+ }
+
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ if (error2 < 0)
+ return error2;
+ return error + error2;
+}
+
static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
- .prefix = XATTR_NAME_NFSV4_ACL,
+ .name = XATTR_NAME_NFSV4_ACL,
.list = nfs4_xattr_list_nfs4_acl,
.get = nfs4_xattr_get_nfs4_acl,
.set = nfs4_xattr_set_nfs4_acl,
return -EIO;
}
-static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
+/*
+ * nfs_get_link - ->get_link() for NFS symlinks
+ * @dentry: dentry of the symlink, or NULL
+ * @inode: inode of the symlink
+ * @done: delayed call set up to drop the page reference (page_put_link)
+ *
+ * With a NULL @dentry the mapping is only checked with
+ * nfs_revalidate_mapping_rcu() and page 0 is used only if it is already
+ * cached and uptodate; otherwise ERR_PTR(-ECHILD) (or the check's error) is
+ * returned.  With a non-NULL @dentry the mapping is revalidated and the page
+ * is read through nfs_symlink_filler().  Returns the kernel address of the
+ * page on success.
+ *
+ * NOTE(review): page_address() is used without kmap(), so this presumably
+ * relies on the mapping not handing out highmem pages - confirm.
+ */
+static const char *nfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct page *page;
void *err;
- err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
- if (err)
- return err;
- page = read_cache_page(&inode->i_data, 0,
- (filler_t *)nfs_symlink_filler, inode);
- if (IS_ERR(page))
- return ERR_CAST(page);
- *cookie = page;
- return kmap(page);
+ /* NULL dentry: only checks and a page-cache lookup, no read is started */
+ if (!dentry) {
+ err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
+ if (err)
+ return err;
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+ if (err)
+ return err;
+ page = read_cache_page(&inode->i_data, 0,
+ (filler_t *)nfs_symlink_filler, inode);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ }
+ /* the page reference is dropped when the caller runs @done */
+ set_delayed_call(done, page_put_link, page);
+ return page_address(page);
}
/*
*/
const struct inode_operations nfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = nfs_follow_link,
- .put_link = page_put_link,
+ .get_link = nfs_get_link,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
};
mutex_lock(&ls->ls_mutex);
nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+ mutex_unlock(&ls->ls_mutex);
}
static int
trace_layout_recall_release(&ls->ls_stid.sc_stateid);
- mutex_unlock(&ls->ls_mutex);
nfsd4_return_all_layouts(ls, &reaplist);
nfsd4_free_layouts(&reaplist);
nfs4_put_stid(&ls->ls_stid);
inode->i_mapping->a_ops = &nilfs_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
} else {
inode->i_op = &nilfs_special_inode_operations;
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.permission = nilfs_permission,
};
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ else
+ res->migration_pending = 0;
spin_unlock(&res->spinlock);
/*
break;
case S_IFLNK:
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
*/
locks_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
ocfs2_file_unlock(file);
}
inode->i_rdev = 0;
newsize = l - 1;
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
u16 cl_cpg,
+ u16 old_bg_clusters,
int set)
{
int i;
u16 backups = 0;
- u32 cluster;
+ u32 cluster, lgd_cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
else if (gd_blkno > lgd_blkno)
break;
+ /* check if already done backup super */
+ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
+ lgd_cluster += old_bg_clusters;
+ if (lgd_cluster >= cluster)
+ continue;
+
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+ u16 old_bg_clusters;
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 1);
+ cl_cpg, old_bg_clusters, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 0);
+ cl_cpg, old_bg_clusters, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
const struct inode_operations ocfs2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
.setxattr = generic_setxattr,
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
-
- return handler ? handler->prefix : NULL;
+ return handler ? xattr_prefix(handler) : NULL;
}
static u32 ocfs2_xattr_name_hash(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_list_entry(char *buffer, size_t size,
- size_t *result, const char *prefix,
+static int ocfs2_xattr_list_entry(struct super_block *sb,
+ char *buffer, size_t size,
+ size_t *result, int type,
const char *name, int name_len)
{
char *p = buffer + *result;
- int prefix_len = strlen(prefix);
- int total_len = prefix_len + name_len + 1;
+ const char *prefix;
+ int prefix_len;
+ int total_len;
+ switch(type) {
+ case OCFS2_XATTR_INDEX_USER:
+ if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
+ case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
+ if (!(sb->s_flags & MS_POSIXACL))
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+ break;
+ }
+
+ prefix = ocfs2_xattr_prefix(type);
+ if (!prefix)
+ return 0;
+ prefix_len = strlen(prefix);
+ total_len = prefix_len + name_len + 1;
*result += total_len;
/* we are just looking for how big our buffer needs to be */
{
size_t result = 0;
int i, type, ret;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
-
- if (prefix) {
- name = (const char *)header +
- le16_to_cpu(entry->xe_name_offset);
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
- ret = ocfs2_xattr_list_entry(buffer, buffer_size,
- &result, prefix, name,
- entry->xe_name_len);
- if (ret)
- return ret;
- }
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ buffer, buffer_size,
+ &result, type, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
}
return result;
int ret = 0, type;
struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
int i, block_off, new_offset;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
- if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
- bucket_xh(bucket),
- i,
- &block_off,
- &new_offset);
- if (ret)
- break;
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(bucket),
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
- name = (const char *)bucket_block(bucket, block_off) +
- new_offset;
- ret = ocfs2_xattr_list_entry(xl->buffer,
- xl->buffer_size,
- &xl->result,
- prefix, name,
- entry->xe_name_len);
- if (ret)
- break;
- }
+ name = (const char *)bucket_block(bucket, block_off) +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ type, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
}
return ret;
leave:
return ret;
}
+
/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len)
-{
- const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
.set = ocfs2_xattr_security_set,
};
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len)
-{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
.set = ocfs2_xattr_trusted_set,
};
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len)
-{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
.set = ocfs2_xattr_user_set,
};
return err;
}
-
-struct ovl_link_data {
- struct dentry *realdentry;
- void *cookie;
-};
-
-static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
+static const char *ovl_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
- struct ovl_link_data *data = NULL;
- const char *ret;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
- if (WARN_ON(!realinode->i_op->follow_link))
+ if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
- if (realinode->i_op->put_link) {
- data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
- if (!data)
- return ERR_PTR(-ENOMEM);
- data->realdentry = realdentry;
- }
-
- ret = realinode->i_op->follow_link(realdentry, cookie);
- if (IS_ERR_OR_NULL(ret)) {
- kfree(data);
- return ret;
- }
-
- if (data)
- data->cookie = *cookie;
-
- *cookie = data;
-
- return ret;
-}
-
-static void ovl_put_link(struct inode *unused, void *c)
-{
- struct inode *realinode;
- struct ovl_link_data *data = c;
-
- if (!data)
- return;
-
- realinode = data->realdentry->d_inode;
- realinode->i_op->put_link(realinode, data->cookie);
- kfree(data);
+ return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
- .follow_link = ovl_follow_link,
- .put_link = ovl_put_link,
+ .get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
struct posix_acl *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
if (!IS_POSIXACL(d_backing_inode(dentry)))
return -EOPNOTSUPP;
if (d_is_symlink(dentry))
struct posix_acl *acl = NULL;
int ret;
- if (strcmp(name, "") != 0)
- return -EINVAL;
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
return ret;
}
-static size_t
-posix_acl_xattr_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool
+posix_acl_xattr_list(struct dentry *dentry)
{
- const char *xname = handler->prefix;
- size_t size;
-
- if (!IS_POSIXACL(d_backing_inode(dentry)))
- return 0;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return IS_POSIXACL(d_backing_inode(dentry));
}
const struct xattr_handler posix_acl_access_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
const struct xattr_handler posix_acl_default_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
return -ENOENT;
}
-static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_pid_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct path path;
int error = -EACCES;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
/* Are we allowed to snoop on the tasks file descriptors? */
if (!proc_fd_access_allowed(inode))
goto out;
const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
+ .get_link = proc_pid_get_link,
.setattr = proc_setattr,
};
.d_delete = pid_delete_dentry,
};
-static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path)
{
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
* path to the file in question.
*/
static const char *
-proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+proc_map_files_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- return proc_pid_follow_link(dentry, NULL);
+ return proc_pid_get_link(dentry, inode, done);
}
/*
- * Identical to proc_pid_link_inode_operations except for follow_link()
+ * Identical to proc_pid_link_inode_operations except for get_link()
*/
static const struct inode_operations proc_map_files_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_map_files_follow_link,
+ .get_link = proc_map_files_get_link,
.setattr = proc_setattr,
};
return -ENOENT;
ei = PROC_I(inode);
- ei->op.proc_get_link = proc_map_files_get_link;
+ ei->op.proc_get_link = map_files_get_link;
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
+ ret = 0;
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
};
#endif
-static const char *proc_follow_link(struct dentry *dentry, void **cookie)
+static void proc_put_link(void *p)
{
- struct proc_dir_entry *pde = PDE(d_inode(dentry));
- if (unlikely(!use_pde(pde)))
- return ERR_PTR(-EINVAL);
- *cookie = pde;
- return pde->data;
+ unuse_pde(p);
}
-static void proc_put_link(struct inode *unused, void *p)
+static const char *proc_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unuse_pde(p);
+ struct proc_dir_entry *pde = PDE(inode);
+ if (unlikely(!use_pde(pde)))
+ return ERR_PTR(-EINVAL);
+ set_delayed_call(done, proc_put_link, pde);
+ return pde->data;
}
const struct inode_operations proc_link_inode_operations = {
.readlink = generic_readlink,
- .follow_link = proc_follow_link,
- .put_link = proc_put_link,
+ .get_link = proc_get_link,
};
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
&mntns_operations,
};
-static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_ns_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
task = get_proc_task(inode);
if (!task)
return error;
static const struct inode_operations proc_ns_link_inode_operations = {
.readlink = proc_ns_readlink,
- .follow_link = proc_ns_follow_link,
+ .get_link = proc_ns_get_link,
.setattr = proc_setattr,
};
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
if (!tgid)
return ERR_PTR(-ENOENT);
/* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d", tgid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_self_inode_operations = {
.readlink = proc_self_readlink,
- .follow_link = proc_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_self_get_link,
};
static unsigned self_inum;
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_thread_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
if (!pid)
return ERR_PTR(-ENOENT);
- name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
+ dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d/task/%d", tgid, pid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_thread_self_inode_operations = {
.readlink = proc_thread_self_readlink,
- .follow_link = proc_thread_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_thread_self_get_link,
};
static unsigned thread_self_inum;
inode->i_fop = &qnx4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx4_aops;
qnx4_i(inode)->mmu_private = inode->i_size;
} else {
inode->i_mapping->a_ops = &qnx6_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx6_aops;
} else
init_special_inode(inode, inode->i_mode, 0);
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
}
inode->i_fop = &reiserfs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
} else {
inode->i_blocks = 0;
reiserfs_update_inode_transaction(parent_dir);
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
*/
const struct inode_operations reiserfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
return NULL;
for_each_xattr_handler(handlers, xah) {
- if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+ const char *prefix = xattr_prefix(xah);
+ if (strncmp(prefix, name, strlen(prefix)) == 0)
break;
}
handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
name);
- if (!handler) /* Unsupported xattr name */
+ if (!handler /* Unsupported xattr name */ ||
+ (handler->list && !handler->list(b->dentry)))
return 0;
+ size = namelen + 1;
if (b->buf) {
- size = handler->list(handler, b->dentry,
- b->buf + b->pos, b->size, name,
- namelen);
if (size > b->size)
return -ERANGE;
- } else {
- size = handler->list(handler, b->dentry,
- NULL, 0, name, namelen);
+ memcpy(b->buf + b->pos, name, namelen);
+ b->buf[b->pos + namelen] = 0;
}
-
b->pos += size;
}
return 0;
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t security_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t namelen)
+static bool security_list(struct dentry *dentry)
{
- const size_t len = namelen + 1;
-
- if (IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_len) {
- memcpy(list, name, namelen);
- list[namelen] = '\0';
- }
-
- return len;
+ return !IS_PRIVATE(d_inode(dentry));
}
/* Initializes the security context for a new inode and returns the number
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t trusted_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool trusted_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
}
const struct xattr_handler reiserfs_xattr_trusted_handler = {
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t user_list(const struct xattr_handler *handler,
- struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len)
+static bool user_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!reiserfs_xattrs_user(dentry->d_sb))
- return 0;
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return reiserfs_xattrs_user(dentry->d_sb);
}
const struct xattr_handler reiserfs_xattr_user_handler = {
break;
case ROMFH_SYM:
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &romfs_aops;
mode |= S_IRWXUGO;
break;
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/xattr.h>
+#include <linux/pagemap.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
inode->i_op = &squashfs_symlink_inode_ops;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &squashfs_symlink_aops;
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
const struct inode_operations squashfs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
struct squashfs_xattr_entry entry;
struct squashfs_xattr_val val;
const struct xattr_handler *handler;
- int name_size, prefix_size = 0;
+ int name_size;
err = squashfs_read_metadata(sb, &entry, &start, &offset,
sizeof(entry));
name_size = le16_to_cpu(entry.size);
handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
- if (handler)
- prefix_size = handler->list(handler, d, buffer, rest,
- NULL, name_size);
- if (prefix_size) {
+ if (handler && (!handler->list || handler->list(d))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_size = strlen(prefix);
+
if (buffer) {
if (prefix_size + name_size + 1 > rest) {
err = -ERANGE;
goto failed;
}
+ memcpy(buffer, prefix, prefix_size);
buffer += prefix_size;
}
err = squashfs_read_metadata(sb, buffer, &start,
}
-static size_t squashfs_xattr_handler_list(const struct xattr_handler *handler,
- struct dentry *d, char *list,
- size_t list_size, const char *name,
- size_t name_len)
-{
- int len = strlen(handler->prefix);
-
- if (list && len <= list_size)
- memcpy(list, handler->prefix, len);
- return len;
-}
-
static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
struct dentry *d, const char *name,
void *buffer, size_t size)
{
- if (name[0] == '\0')
- return -EINVAL;
-
return squashfs_xattr_get(d_inode(d), handler->flags, name,
buffer, size);
}
static const struct xattr_handler squashfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = SQUASHFS_XATTR_USER,
- .list = squashfs_xattr_handler_list,
.get = squashfs_xattr_handler_get
};
/*
* Trusted namespace support
*/
-static size_t squashfs_trusted_xattr_handler_list(const struct xattr_handler *handler,
- struct dentry *d, char *list,
- size_t list_size, const char *name,
- size_t name_len)
+static bool squashfs_trusted_xattr_handler_list(struct dentry *d)
{
- if (!capable(CAP_SYS_ADMIN))
- return 0;
- return squashfs_xattr_handler_list(handler, d, list, list_size, name,
- name_len);
+ return capable(CAP_SYS_ADMIN);
}
static const struct xattr_handler squashfs_xattr_trusted_handler = {
static const struct xattr_handler squashfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = SQUASHFS_XATTR_SECURITY,
- .list = squashfs_xattr_handler_list,
.get = squashfs_xattr_handler_get
};
static const struct inode_operations sysv_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = sysv_getattr,
};
inode->i_mapping->a_ops = &sysv_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &sysv_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &sysv_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
const struct inode_operations ubifs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.setxattr = ubifs_setxattr,
break;
case ICBTAG_FILE_TYPE_SYMLINK:
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
break;
case ICBTAG_FILE_TYPE_MAIN:
}
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
struct kernel_lb_addr eloc;
.rename = udf_rename,
.tmpfile = udf_tmpfile,
};
-const struct inode_operations udf_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
struct buffer_head *bh = NULL;
unsigned char *symlink;
int err;
- unsigned char *p = kmap(page);
+ unsigned char *p = page_address(page);
struct udf_inode_info *iinfo;
uint32_t pos;
up_read(&iinfo->i_data_sem);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
up_read(&iinfo->i_data_sem);
SetPageError(page);
out_unmap:
- kunmap(page);
unlock_page(page);
return err;
}
extern const struct file_operations udf_dir_operations;
extern const struct inode_operations udf_file_inode_operations;
extern const struct file_operations udf_file_operations;
-extern const struct inode_operations udf_symlink_inode_operations;
extern const struct address_space_operations udf_aops;
extern const struct address_space_operations udf_adinicb_aops;
extern const struct address_space_operations udf_symlink_aops;
obj-$(CONFIG_UFS_FS) += ufs.o
ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
- namei.o super.o symlink.o util.o
+ namei.o super.o util.o
ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
inode->i_mapping->a_ops = &ufs_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (!inode->i_blocks) {
- inode->i_op = &ufs_fast_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+ inode->i_op = &simple_symlink_inode_operations;
} else {
- inode->i_op = &ufs_symlink_inode_operations;
inode->i_mapping->a_ops = &ufs_aops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
}
} else
init_special_inode(inode, inode->i_mode,
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
- inode->i_op = &ufs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &ufs_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
} else {
/* fast symlink */
- inode->i_op = &ufs_fast_symlink_inode_operations;
+ inode->i_op = &simple_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
+++ /dev/null
-/*
- * linux/fs/ufs/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@emai.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- * from
- *
- * linux/fs/ext2/symlink.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/symlink.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext2 symlink handling code
- */
-
-#include "ufs_fs.h"
-#include "ufs.h"
-
-const struct inode_operations ufs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = simple_follow_link,
- .setattr = ufs_setattr,
-};
-
-const struct inode_operations ufs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
- .setattr = ufs_setattr,
-};
void ufs_panic(struct super_block *, const char *, const char *, ...);
void ufs_mark_sb_dirty(struct super_block *sb);
-/* symlink.c */
-extern const struct inode_operations ufs_fast_symlink_inode_operations;
-extern const struct inode_operations ufs_symlink_inode_operations;
-
static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
return error;
}
-/* Compare an extended attribute value with the given value */
-int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
- const char *value, size_t size, gfp_t flags)
-{
- char *xattr_value = NULL;
- int rc;
-
- rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
- if (rc < 0)
- return rc;
-
- if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
- rc = -EINVAL;
- else
- rc = 0;
- kfree(xattr_value);
- return rc;
-}
-
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
return NULL;
for_each_xattr_handler(handlers, handler) {
- const char *n = strcmp_prefix(*name, handler->prefix);
+ const char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
*name = n;
- break;
+ return handler;
}
}
- return handler;
+ return ERR_PTR(-EOPNOTSUPP);
}
/*
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
return handler->get(handler, dentry, name, buffer, size);
}
if (!buffer) {
for_each_xattr_handler(handlers, handler) {
- size += handler->list(handler, dentry, NULL, 0,
- NULL, 0);
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ size += strlen(handler->name) + 1;
}
} else {
char *buf = buffer;
+ size_t len;
for_each_xattr_handler(handlers, handler) {
- size = handler->list(handler, dentry, buf, buffer_size,
- NULL, 0);
- if (size > buffer_size)
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ len = strlen(handler->name);
+ if (len + 1 > buffer_size)
return -ERANGE;
- buf += size;
- buffer_size -= size;
+ memcpy(buf, handler->name, len + 1);
+ buf += len + 1;
+ buffer_size -= len + 1;
}
size = buf - buffer;
}
if (size == 0)
value = ""; /* empty EA, do not remove */
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
return handler->set(handler, dentry, name, value, size, flags);
}
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
const char *xattr_full_name(const struct xattr_handler *handler,
const char *name)
{
- size_t prefix_len = strlen(handler->prefix);
+ size_t prefix_len = strlen(xattr_prefix(handler));
return name - prefix_len;
}
return ret;
}
-static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
+/**
+ * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
+ * @xattrs: target simple_xattr list
+ * @name: name of the extended attribute
+ * @value: value of the xattr. If %NULL, will remove the attribute.
+ * @size: size of the new xattr
+ * @flags: %XATTR_{CREATE|REPLACE}
+ *
+ * If %XATTR_CREATE is set, the xattr must not exist already; otherwise the
+ * call fails with -EEXIST. If %XATTR_REPLACE is set, the xattr must exist;
+ * otherwise the call fails with -ENODATA.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+ const void *value, size_t size, int flags)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
}
-/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the new extended attribute
- * @value: value of the new xattr. If %NULL, will remove the attribute
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- *
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
- *
- * Returns 0 on success, -errno on failure.
- */
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
-{
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- return __simple_xattr_set(xattrs, name, value, size, flags);
-}
-
-/*
- * xattr REMOVE operation for in-memory/pseudo filesystems
- */
-int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
+static bool xattr_is_trusted(const char *name)
{
- return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
+ return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static bool xattr_is_trusted(const char *name)
+/*
+ * Account for one NUL-terminated attribute @name in a listing.
+ *
+ * When *@buffer is non-NULL the name, including its terminator, is copied
+ * there and *@buffer is advanced past it; if fewer than strlen(@name) + 1
+ * bytes remain, -ERANGE is returned and nothing is copied or accounted.
+ * When *@buffer is NULL nothing is copied. On success *@remaining_size is
+ * reduced by the name length (terminator included) and 0 is returned.
+ */
+static int xattr_list_one(char **buffer, ssize_t *remaining_size,
+ const char *name)
{
- return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ size_t len = strlen(name) + 1; /* include the terminating NUL */
+ if (*buffer) {
+ if (*remaining_size < len)
+ return -ERANGE;
+ memcpy(*buffer, name, len);
+ *buffer += len;
+ }
+ *remaining_size -= len;
+ return 0;
}
/*
* xattr LIST operation for in-memory/pseudo filesystems
+ *
+ * When CONFIG_FS_POSIX_ACL is set, the POSIX ACL names are listed first for
+ * whichever of inode->i_acl and inode->i_default_acl is non-NULL. Every name
+ * in @xattrs follows, except that "trusted." names are skipped unless the
+ * caller has CAP_SYS_ADMIN. With a NULL @buffer nothing is copied and only
+ * the size is accumulated.
+ *
+ * Returns the number of bytes listed (names plus their NUL terminators), or
+ * the negative error reported by xattr_list_one().
*/
-ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
- size_t size)
+ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
+ char *buffer, size_t size)
{
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
- size_t used = 0;
+ ssize_t remaining_size = size;
+ int err = 0;
+
+#ifdef CONFIG_FS_POSIX_ACL
+ /* The lock is not held yet, so returning directly is fine here. */
+ if (inode->i_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err)
+ return err;
+ }
+ if (inode->i_default_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err)
+ return err;
+ }
+#endif
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
- size_t len;
-
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
- len = strlen(xattr->name) + 1;
- used += len;
- if (buffer) {
- if (size < used) {
- used = -ERANGE;
- break;
- }
- memcpy(buffer, xattr->name, len);
- buffer += len;
- }
+ err = xattr_list_one(&buffer, &remaining_size, xattr->name);
+ /* Leave the loop instead of returning: xattrs->lock is held. */
+ if (err)
+ break;
}
spin_unlock(&xattrs->lock);
- return used;
+ return err ? err : size - remaining_size;
}
/*
return error;
}
-static int
-xfs_acl_exists(struct inode *inode, unsigned char *name)
-{
- int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
-
- return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
- ATTR_ROOT|ATTR_KERNOVAL) == 0);
-}
-
-int
-posix_acl_access_exists(struct inode *inode)
-{
- return xfs_acl_exists(inode, SGI_ACL_FILE);
-}
-
-int
-posix_acl_default_exists(struct inode *inode)
-{
- if (!S_ISDIR(inode->i_mode))
- return 0;
- return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
-}
-
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int posix_acl_access_exists(struct inode *inode);
-extern int posix_acl_default_exists(struct inode *inode);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
# define xfs_set_acl NULL
-# define posix_acl_access_exists(inode) 0
-# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
* uio is kmalloced for this reason...
*/
STATIC const char *
-xfs_vn_follow_link(
+xfs_vn_get_link(
struct dentry *dentry,
- void **cookie)
+ struct inode *inode,
+ struct delayed_call *done)
{
char *link;
int error = -ENOMEM;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
if (!link)
goto out_err;
if (unlikely(error))
goto out_kfree;
- return *cookie = link;
+ set_delayed_call(done, kfree_link, link);
+ return link;
out_kfree:
kfree(link);
static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = xfs_vn_follow_link,
- .put_link = kfree_put_link,
+ .get_link = xfs_vn_get_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
/* Convert Linux syscall to XFS internal ATTR flags */
if (!size) {
xflags |= ATTR_KERNOVAL;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error;
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
/* Convert Linux syscall to XFS internal ATTR flags */
if (flags & XATTR_CREATE)
xflags |= ATTR_CREATE;
NULL
};
-static unsigned int xfs_xattr_prefix_len(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return sizeof("security");
- else if (flags & XFS_ATTR_ROOT)
- return sizeof("trusted");
- else
- return sizeof("user");
-}
-
-static const char *xfs_xattr_prefix(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return xfs_xattr_security_handler.prefix;
- else if (flags & XFS_ATTR_ROOT)
- return xfs_xattr_trusted_handler.prefix;
- else
- return xfs_xattr_user_handler.prefix;
-}
-
static int
-xfs_xattr_put_listent(
+__xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen,
- unsigned char *value)
+ char *prefix,
+ int prefix_len,
+ unsigned char *name,
+ int namelen)
{
- unsigned int prefix_len = xfs_xattr_prefix_len(flags);
char *offset;
int arraytop;
- ASSERT(context->count >= 0);
-
- /*
- * Only show root namespace entries if we are actually allowed to
- * see them.
- */
- if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
- return 0;
+ if (!context->alist)
+ goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
return 1;
}
offset = (char *)context->alist + context->count;
- strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+ strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
+
+compute_size:
context->count += prefix_len + namelen + 1;
return 0;
}
static int
-xfs_xattr_put_listent_sizes(
+xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
int flags,
unsigned char *name,
int valuelen,
unsigned char *value)
{
- context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
- return 0;
-}
+ char *prefix;
+ int prefix_len;
-static int
-list_one_attr(const char *name, const size_t len, void *data,
- size_t size, ssize_t *result)
-{
- char *p = data + *result;
+ ASSERT(context->count >= 0);
- *result += len;
- if (!size)
- return 0;
- if (*result > size)
- return -ERANGE;
+ if (flags & XFS_ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (namelen == SGI_ACL_FILE_SIZE &&
+ strncmp(name, SGI_ACL_FILE,
+ SGI_ACL_FILE_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_ACCESS,
+ strlen(XATTR_POSIX_ACL_ACCESS));
+ if (ret)
+ return ret;
+ } else if (namelen == SGI_ACL_DEFAULT_SIZE &&
+ strncmp(name, SGI_ACL_DEFAULT,
+ SGI_ACL_DEFAULT_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_DEFAULT,
+ strlen(XATTR_POSIX_ACL_DEFAULT));
+ if (ret)
+ return ret;
+ }
+#endif
- strcpy(p, name);
- return 0;
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ prefix = XATTR_TRUSTED_PREFIX;
+ prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ } else if (flags & XFS_ATTR_SECURE) {
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ } else {
+ prefix = XATTR_USER_PREFIX;
+ prefix_len = XATTR_USER_PREFIX_LEN;
+ }
+
+ return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
}
ssize_t
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
- int error;
/*
* First read the regular on-disk attributes.
context.dp = XFS_I(inode);
context.cursor = &cursor;
context.resynch = 1;
- context.alist = data;
+ context.alist = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
-
- if (size)
- context.put_listent = xfs_xattr_put_listent;
- else
- context.put_listent = xfs_xattr_put_listent_sizes;
+ context.put_listent = xfs_xattr_put_listent;
xfs_attr_list_int(&context);
if (context.count < 0)
return -ERANGE;
- /*
- * Then add the two synthetic ACL attributes.
- */
- if (posix_acl_access_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
- if (posix_acl_default_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
return context.count;
}
#endif /* CONFIG_SMP */
#ifndef smp_store_mb
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
#endif
#ifndef smp_mb__before_atomic
* GNU General Public License for more details.
*
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
*/
#ifndef __ASM_GENERIC_QSPINLOCK_H
#define __ASM_GENERIC_QSPINLOCK_H
static __always_inline int queued_spin_trylock(struct qspinlock *lock)
{
if (!atomic_read(&lock->val) &&
- (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+ (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
return 1;
return 0;
}
{
u32 val;
- val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+ val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
if (likely(val == 0))
return;
queued_spin_lock_slowpath(lock, val);
/*
* smp_mb__before_atomic() in order to guarantee release semantics
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_sub(_Q_LOCKED_VAL, &lock->val);
}
#endif
extern int blk_queue_enter(struct request_queue *q, gfp_t gfp);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_start_queue(struct request_queue *q);
+extern void blk_start_queue_async(struct request_queue *q);
extern void blk_stop_queue(struct request_queue *q);
extern void blk_sync_queue(struct request_queue *q);
extern void __blk_stop_queue(struct request_queue *q);
__u.__val; \
})
+/**
+ * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * @cond: boolean expression to wait for
+ *
+ * Equivalent to using smp_load_acquire() on the condition variable but employs
+ * the control dependency of the wait to reduce the barrier on many platforms.
+ *
+ * The control dependency provides a LOAD->STORE order, the additional RMB
+ * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
+ * aka. ACQUIRE.
+ *
+ * @cond is re-evaluated on every pass of the loop, with cpu_relax() called
+ * between evaluations, so it must be an expression that can safely be
+ * evaluated repeatedly.
+ */
+#define smp_cond_acquire(cond) do { \
+ while (!(cond)) \
+ cpu_relax(); \
+ smp_rmb(); /* ctrl + rmb := acquire */ \
+} while (0)
+
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
--- /dev/null
+#ifndef _DELAYED_CALL_H
+#define _DELAYED_CALL_H
+
+/*
+ * Poor man's closures; I wish we could've done them sanely polymorphic,
+ * but...
+ */
+
+struct delayed_call {
+ void (*fn)(void *); /* callback; do_delayed_call() skips it when NULL */
+ void *arg; /* handed to @fn by do_delayed_call() */
+};
+
+/* Define a struct delayed_call called @name with both fields set to NULL. */
+#define DEFINE_DELAYED_CALL(name) struct delayed_call name = {NULL, NULL}
+
+/*
+ * I really wish we had closures with sane typechecking...
+ *
+ * Record @fn and @arg in @call. Both fields are overwritten unconditionally;
+ * whatever was stored before is not invoked.
+ */
+static inline void set_delayed_call(struct delayed_call *call,
+ void (*fn)(void *), void *arg)
+{
+ call->fn = fn;
+ call->arg = arg;
+}
+
+/*
+ * Run the callback recorded in @call, passing it the recorded argument.
+ * A NULL callback makes this a no-op. The callback stays recorded
+ * afterwards; nothing in @call is modified here.
+ */
+static inline void do_delayed_call(struct delayed_call *call)
+{
+	if (!call->fn)
+		return;
+
+	call->fn(call->arg);
+}
+
+/* Forget the recorded callback by NULLing @call->fn; @call->arg is left as is. */
+static inline void clear_delayed_call(struct delayed_call *call)
+{
+ call->fn = NULL;
+}
+#endif
/* A few generic types ... taken from ses-2 */
enum enclosure_component_type {
ENCLOSURE_COMPONENT_DEVICE = 0x01,
+ ENCLOSURE_COMPONENT_CONTROLLER_ELECTRONICS = 0x07,
+ ENCLOSURE_COMPONENT_SCSI_TARGET_PORT = 0x14,
+ ENCLOSURE_COMPONENT_SCSI_INITIATOR_PORT = 0x15,
ENCLOSURE_COMPONENT_ARRAY_DEVICE = 0x17,
+ ENCLOSURE_COMPONENT_SAS_EXPANDER = 0x18,
};
/* ses-2 common element status */
#define BPF_ANC BIT(15)
+/*
+ * Classify a classic BPF program by its first instruction.
+ *
+ * Returns false when @first is BPF_RET | BPF_K, a BPF_LD | BPF_W | BPF_LEN
+ * load, or an absolute load (W, H or B) whose k differs from
+ * SKF_AD_OFF + SKF_AD_ALU_XOR_X. Returns true for that one ancillary load
+ * and for every other opcode.
+ *
+ * NOTE(review): going by the name, "true" presumably means the accumulator
+ * has to be zeroed before the program runs -- confirm against the callers.
+ */
+static inline bool bpf_needs_clear_a(const struct sock_filter *first)
+{
+	switch (first->code) {
+	case BPF_LD | BPF_W | BPF_ABS:
+	case BPF_LD | BPF_H | BPF_ABS:
+	case BPF_LD | BPF_B | BPF_ABS:
+		/* Only the ALU_XOR_X ancillary load answers true. */
+		return first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X;
+
+	case BPF_RET | BPF_K:
+	case BPF_LD | BPF_W | BPF_LEN:
+		return false;
+	}
+
+	/* Any other leading instruction. */
+	return true;
+}
+
static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
{
BUG_ON(ftest->code & BPF_ANC);
#include <linux/blk_types.h>
#include <linux/workqueue.h>
#include <linux/percpu-rwsem.h>
+#include <linux/delayed_call.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
- const char * (*follow_link) (struct dentry *, void **);
+ const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
int (*permission) (struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
- void (*put_link) (struct inode *, void *);
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
extern int readlink_copy(char __user *, int, const char *);
extern int page_readlink(struct dentry *, char __user *, int);
-extern const char *page_follow_link_light(struct dentry *, void **);
-extern void page_put_link(struct inode *, void *);
+extern const char *page_get_link(struct dentry *, struct inode *,
+ struct delayed_call *);
+extern void page_put_link(void *);
extern int __page_symlink(struct inode *inode, const char *symname, int len,
int nofs);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
-extern void kfree_put_link(struct inode *, void *);
-extern void free_page_put_link(struct inode *, void *);
+extern void kfree_link(void *);
extern int generic_readlink(struct dentry *, char __user *, int);
extern void generic_fillattr(struct inode *, struct kstat *);
int vfs_getattr_nosec(struct path *path, struct kstat *stat);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
-const char *simple_follow_link(struct dentry *, void **);
+const char *simple_get_link(struct dentry *, struct inode *,
+ struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;
extern int iterate_dir(struct file *, struct dir_context *);
extern int vfs_fstat(unsigned int, struct kstat *);
extern int vfs_fstatat(int , const char __user *, struct kstat *, int);
-extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
- unsigned long arg);
extern int __generic_block_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo,
loff_t start, loff_t len,
}
extern bool path_noexec(const struct path *path);
+extern void inode_nohighmem(struct inode *inode);
#endif /* _LINUX_FS_H */
extern int skip_trace(unsigned long ip);
extern void ftrace_module_init(struct module *mod);
+extern void ftrace_release_mod(struct module *mod);
extern void ftrace_disable_daemon(void);
extern void ftrace_enable_daemon(void);
static inline void INIT_LIST_HEAD(struct list_head *list)
{
- list->next = list;
+ WRITE_ONCE(list->next, list);
list->prev = list;
}
next->prev = new;
new->next = next;
new->prev = prev;
- prev->next = new;
+ WRITE_ONCE(prev->next, new);
}
#else
extern void __list_add(struct list_head *new,
*/
static inline int list_empty(const struct list_head *head)
{
- return head->next == head;
+ return READ_ONCE(head->next) == head;
}
/**
static inline int hlist_empty(const struct hlist_head *h)
{
- return !h->first;
+ return !READ_ONCE(h->first);
}
static inline void __hlist_del(struct hlist_node *n)
n->next = first;
if (first)
first->pprev = &n->next;
- h->first = n;
+ WRITE_ONCE(h->first, n);
n->pprev = &h->first;
}
n->pprev = next->pprev;
n->next = next;
next->pprev = &n->next;
- *(n->pprev) = n;
+ WRITE_ONCE(*(n->pprev), n);
}
static inline void hlist_add_behind(struct hlist_node *n,
struct hlist_node *prev)
{
n->next = prev->next;
- prev->next = n;
+ WRITE_ONCE(prev->next, n);
n->pprev = &prev->next;
if (n->next)
static inline int hlist_bl_empty(const struct hlist_bl_head *h)
{
- return !((unsigned long)h->first & ~LIST_BL_LOCKMASK);
+ return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
}
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
- return is_a_nulls(h->first);
+ return is_a_nulls(READ_ONCE(h->first));
}
static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
#ifndef LINUX_MM_DEBUG_H
#define LINUX_MM_DEBUG_H 1
+#include <linux/bug.h>
#include <linux/stringify.h>
struct page;
#define SNOR_MFR_MACRONIX CFI_MFR_MACRONIX
#define SNOR_MFR_SPANSION CFI_MFR_AMD
#define SNOR_MFR_SST CFI_MFR_SST
-#define SNOR_MFR_WINBOND 0xef
+#define SNOR_MFR_WINBOND 0xef /* Also used by some Spansion */
/*
* Note on opcode nomenclature: some opcodes have a format like
})
#define netdev_alloc_pcpu_stats(type) \
- __netdev_alloc_pcpu_stats(type, GFP_KERNEL);
+ __netdev_alloc_pcpu_stats(type, GFP_KERNEL)
#include <linux/notifier.h>
int (*call_rcu)(struct sock *nl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[]);
- int (*call_batch)(struct sock *nl, struct sk_buff *skb,
+ int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[]);
const struct nla_policy *policy; /* netlink attribute policy */
extern int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode);
extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
+extern int nfs_revalidate_mapping_rcu(struct inode *inode);
extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping);
extern int nfs_setattr(struct dentry *, struct iattr *);
extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *);
struct edma_rsv_info *rsv;
/* List of channels allocated for memcpy, terminated with -1 */
- s16 *memcpy_channels;
+ s32 *memcpy_channels;
s8 (*queue_priority_mapping)[2];
const s16 (*xbar_chans)[2];
#ifndef _POSIX_ACL_XATTR_H
#define _POSIX_ACL_XATTR_H
+#include <uapi/linux/xattr.h>
#include <linux/posix_acl.h>
-/* Extended attribute names */
-#define POSIX_ACL_XATTR_ACCESS "system.posix_acl_access"
-#define POSIX_ACL_XATTR_DEFAULT "system.posix_acl_default"
-
/* Supported ACL a_version fields */
#define POSIX_ACL_XATTR_VERSION 0x0002
-
/* An undefined entry e_id value */
#define ACL_UNDEFINED_ID (-1)
#ifndef __COMMON_HSI__
#define __COMMON_HSI__
+#define CORE_SPQE_PAGE_SIZE_BYTES 4096
+
#define FW_MAJOR_VERSION 8
#define FW_MINOR_VERSION 4
#define FW_REVISION_VERSION 2
used = ((u32)0x10000u + (u32)(p_chain->prod_idx)) -
(u32)p_chain->cons_idx;
if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR)
- used -= (used / p_chain->elem_per_page);
+ used -= p_chain->prod_idx / p_chain->elem_per_page -
+ p_chain->cons_idx / p_chain->elem_per_page;
return p_chain->capacity - used;
}
}
/**
- * list_splice_init_rcu - splice an RCU-protected list into an existing list.
+ * __list_splice_init_rcu - join an RCU-protected list into an existing list.
* @list: the RCU-protected list to splice
- * @head: the place in the list to splice the first list into
+ * @prev: points to the last element of the existing list
+ * @next: points to the first element of the existing list
* @sync: function to sync: synchronize_rcu(), synchronize_sched(), ...
*
- * @head can be RCU-read traversed concurrently with this function.
+ * The list pointed to by @prev and @next can be RCU-read traversed
+ * concurrently with this function.
*
* Note that this function blocks.
*
- * Important note: the caller must take whatever action is necessary to
- * prevent any other updates to @head. In principle, it is possible
- * to modify the list as soon as sync() begins execution.
- * If this sort of thing becomes necessary, an alternative version
- * based on call_rcu() could be created. But only if -really-
- * needed -- there is no shortage of RCU API members.
+ * Important note: the caller must take whatever action is necessary to prevent
+ * any other updates to the existing list. In principle, it is possible to
+ * modify the list as soon as sync() begins execution. If this sort of thing
+ * becomes necessary, an alternative version based on call_rcu() could be
+ * created. But only if -really- needed -- there is no shortage of RCU API
+ * members.
*/
-static inline void list_splice_init_rcu(struct list_head *list,
- struct list_head *head,
- void (*sync)(void))
+static inline void __list_splice_init_rcu(struct list_head *list,
+ struct list_head *prev,
+ struct list_head *next,
+ void (*sync)(void))
{
struct list_head *first = list->next;
struct list_head *last = list->prev;
- struct list_head *at = head->next;
-
- if (list_empty(list))
- return;
/*
* "first" and "last" tracking list, so initialize it. RCU readers
* this function.
*/
- last->next = at;
- rcu_assign_pointer(list_next_rcu(head), first);
- first->prev = head;
- at->prev = last;
+ last->next = next;
+ rcu_assign_pointer(list_next_rcu(prev), first);
+ first->prev = prev;
+ next->prev = last;
+}
+
+/**
+ * list_splice_init_rcu - splice an RCU-protected list into an existing list,
+ *                        designed for stacks.
+ * @list:	the RCU-protected list to splice
+ * @head:	the place in the existing list to splice the first list into
+ * @sync:	function to sync: synchronize_rcu(), synchronize_sched(), ...
+ *
+ * Hands @list to __list_splice_init_rcu() with @head and @head->next as the
+ * two neighbours, i.e. the entries go in directly after @head. An empty
+ * @list is left alone and nothing is called.
+ */
+static inline void list_splice_init_rcu(struct list_head *list,
+					struct list_head *head,
+					void (*sync)(void))
+{
+	if (list_empty(list))
+		return;
+
+	__list_splice_init_rcu(list, head, head->next, sync);
+}
+
+/**
+ * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
+ *                             list, designed for queues.
+ * @list:	the RCU-protected list to splice
+ * @head:	the place in the existing list to splice the first list into
+ * @sync:	function to sync: synchronize_rcu(), synchronize_sched(), ...
+ *
+ * Hands @list to __list_splice_init_rcu() with @head->prev and @head as the
+ * two neighbours, i.e. the entries go in directly before @head. An empty
+ * @list is left alone and nothing is called.
+ */
+static inline void list_splice_tail_init_rcu(struct list_head *list,
+					     struct list_head *head,
+					     void (*sync)(void))
+{
+	if (list_empty(list))
+		return;
+
+	__list_splice_init_rcu(list, head->prev, head, sync);
}
/**
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
/**
+ * list_entry_lockless - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_head within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu(), but requires some implicit RCU
+ * read-side guarding. One example is running within a special
+ * exception-time environment where preemption is disabled and where
+ * lockdep cannot be invoked (in which case updaters must use RCU-sched,
+ * as in synchronize_sched(), call_rcu_sched(), and friends). Another
+ * example is when items are added to the list, but never deleted.
+ *
+ * @ptr is loaded through lockless_dereference() and cast back to its own
+ * type before being handed to container_of().
+ */
+#define list_entry_lockless(ptr, type, member) \
+ container_of((typeof(ptr))lockless_dereference(ptr), type, member)
+
+/**
+ * list_for_each_entry_lockless - iterate over rcu list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu(), but requires some implicit RCU
+ * read-side guarding. One example is running within a special
+ * exception-time environment where preemption is disabled and where
+ * lockdep cannot be invoked (in which case updaters must use RCU-sched,
+ * as in synchronize_sched(), call_rcu_sched(), and friends). Another
+ * example is when items are added to the list, but never deleted.
+ *
+ * @pos and @head are expanded more than once, so both should be free of
+ * side effects. Each step fetches the next entry with list_entry_lockless().
+ */
+#define list_for_each_entry_lockless(pos, head, member) \
+ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry_lockless(pos->member.next, typeof(*pos), member))
+
+/**
* list_for_each_entry_continue_rcu - continue iteration over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
#include <asm/barrier.h>
+#ifndef CONFIG_TINY_RCU
extern int rcu_expedited; /* for sysctl */
+extern int rcu_normal; /* also for sysctl */
+#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+/* CONFIG_TINY_RCU variant: unconditionally reports true. */
+static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
+{
+ return true;
+}
static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
{
return false;
{
}
#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
/* Internal to kernel */
void rcu_init(void);
-void rcu_end_inkernel_boot(void);
void rcu_sched_qs(void);
void rcu_bh_qs(void);
void rcu_check_callbacks(int user);
int rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu);
+#ifndef CONFIG_TINY_RCU
+void rcu_end_inkernel_boot(void);
+#else /* #ifndef CONFIG_TINY_RCU */
+static inline void rcu_end_inkernel_boot(void) { }
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
*/
#define RCU_NONIDLE(a) \
do { \
- rcu_irq_enter(); \
+ rcu_irq_enter_irqson(); \
do { a; } while (0); \
- rcu_irq_exit(); \
+ rcu_irq_exit_irqson(); \
} while (0)
/*
* The tracing infrastructure traces RCU (we want that), but unfortunately
* some of the RCU checks causes tracing to lock up the system.
*
- * The tracing version of rcu_dereference_raw() must not call
+ * The no-tracing version of rcu_dereference_raw() must not call
* rcu_read_lock_held().
*/
#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
{
}
+/* Empty stub: this variant of rcu_irq_exit_irqson() performs no work. */
+static inline void rcu_irq_exit_irqson(void)
+{
+}
+
+/* Empty stub: this variant of rcu_irq_enter_irqson() performs no work. */
+static inline void rcu_irq_enter_irqson(void)
+{
+}
+
static inline void rcu_irq_exit(void)
{
}
/*
* Note a virtualization-based context switch. This is simply a
* wrapper around rcu_note_context_switch(), which allows TINY_RCU
- * to save a few bytes.
+ * to save a few bytes. The caller must have disabled interrupts.
*/
static inline void rcu_virt_note_context_switch(int cpu)
{
void rcu_idle_exit(void);
void rcu_irq_enter(void);
void rcu_irq_exit(void);
+void rcu_irq_enter_irqson(void);
+void rcu_irq_exit_irqson(void);
void exit_rcu(void);
#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
int rhashtable_init(struct rhashtable *ht,
const struct rhashtable_params *params);
-int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
- struct rhash_head *obj,
- struct bucket_table *old_tbl);
-int rhashtable_insert_rehash(struct rhashtable *ht);
+struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+ const void *key,
+ struct rhash_head *obj,
+ struct bucket_table *old_tbl);
+int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
void rhashtable_walk_exit(struct rhashtable_iter *iter);
new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(new_tbl)) {
- err = rhashtable_insert_slow(ht, key, obj, new_tbl);
- if (err == -EAGAIN)
+ tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
+ if (!IS_ERR_OR_NULL(tbl))
goto slow_path;
+
+ err = PTR_ERR(tbl);
goto out;
}
if (unlikely(rht_grow_above_100(ht, tbl))) {
slow_path:
spin_unlock_bh(lock);
- err = rhashtable_insert_rehash(ht);
+ err = rhashtable_insert_rehash(ht, tbl);
rcu_read_unlock();
if (err)
return err;
/* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality;
- unsigned in_execve:1; /* Tell the LSMs that the process is doing an
- * execve */
- unsigned in_iowait:1;
-
- /* Revert to default priority/policy when forking */
+ /* scheduler bits, serialized by scheduler locks */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
+ unsigned :0; /* force alignment to the next boundary */
+
+ /* unserialized, strictly 'current' */
+ unsigned in_execve:1; /* bit to tell LSMs we're in execve */
+ unsigned in_iowait:1;
#ifdef CONFIG_MEMCG
unsigned memcg_may_oom:1;
#endif
}
/**
- * is_global_init - check if a task structure is init
+ * is_global_init - check if a task structure is init. Since init
+ * is free to have sub-threads we need to check tgid.
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
*/
static inline int is_global_init(struct task_struct *tsk)
{
- return tsk->pid == 1;
+ return task_tgid_nr(tsk) == 1;
}
extern struct pid *cad_pid;
TP_PROTO(data_proto), \
TP_ARGS(data_args), \
TP_CONDITION(cond), \
- rcu_irq_enter(), \
- rcu_irq_exit()); \
+ rcu_irq_enter_irqson(), \
+ rcu_irq_exit_irqson()); \
}
#else
#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
};
u8 cdc_ncm_select_altsetting(struct usb_interface *intf);
+int cdc_ncm_change_mtu(struct net_device *net, int new_mtu);
int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags);
void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf);
struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign);
#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
#ifdef CONFIG_SMP
-void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int);
+void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);
-void mod_zone_page_state(struct zone *, enum zone_stat_item, int);
+void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);
* The functions directly modify the zone and global counters.
*/
static inline void __mod_zone_page_state(struct zone *zone,
- enum zone_stat_item item, int delta)
+ enum zone_stat_item item, long delta)
{
zone_page_state_add(delta, zone, item);
}
struct inode;
struct dentry;
+/*
+ * struct xattr_handler: When @name is set, match attributes with exactly that
+ * name. When @prefix is set instead, match attributes with that prefix and
+ * with a non-empty suffix.
+ */
struct xattr_handler {
+ const char *name;
const char *prefix;
int flags; /* fs private flags */
- size_t (*list)(const struct xattr_handler *, struct dentry *dentry,
- char *list, size_t list_size, const char *name,
- size_t name_len);
+ bool (*list)(struct dentry *dentry);
int (*get)(const struct xattr_handler *, struct dentry *dentry,
const char *name, void *buffer, size_t size);
int (*set)(const struct xattr_handler *, struct dentry *dentry,
int generic_removexattr(struct dentry *dentry, const char *name);
ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name,
char **xattr_value, size_t size, gfp_t flags);
-int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
- const char *value, size_t size, gfp_t flags);
+
+static inline const char *xattr_prefix(const struct xattr_handler *handler)
+{
+ return handler->prefix ?: handler->name;
+}
struct simple_xattrs {
struct list_head head;
void *buffer, size_t size);
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
const void *value, size_t size, int flags);
-int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name);
-ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
+ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
size_t size);
void simple_xattr_list_add(struct simple_xattrs *xattrs,
struct simple_xattr *new_xattr);
}
}
+/**
+ * dst_hold_safe - Take a reference on a dst if possible
+ * @dst: pointer to dst entry
+ *
+ * This helper returns false if it could not safely
+ * take a reference on a dst.
+ */
+static inline bool dst_hold_safe(struct dst_entry *dst)
+{
+ if (dst->flags & DST_NOCACHE)
+ return atomic_inc_not_zero(&dst->__refcnt);
+ dst_hold(dst);
+ return true;
+}
+
+/**
+ * skb_dst_force_safe - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted and not destroyed, grab a ref on it.
+ */
+static inline void skb_dst_force_safe(struct sk_buff *skb)
+{
+ if (skb_dst_is_noref(skb)) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (!dst_hold_safe(dst))
+ dst = NULL;
+
+ skb->_skb_refdst = (unsigned long)dst;
+ }
+}
+
/**
* __skb_tunnel_rx - prepare skb for rx reinsert
#define IP_CMSG_ORIGDSTADDR BIT(6)
#define IP_CMSG_CHECKSUM BIT(7)
-/* SYNACK messages might be attached to request sockets.
+/**
+ * sk_to_full_sk - Access to a full socket
+ * @sk: pointer to a socket
+ *
+ * SYNACK messages might be attached to request sockets.
* Some places want to reach the listener in this case.
*/
-static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
+static inline struct sock *sk_to_full_sk(struct sock *sk)
{
- struct sock *sk = skb->sk;
-
+#ifdef CONFIG_INET
if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
sk = inet_reqsk(sk)->rsk_listener;
+#endif
+ return sk;
+}
+
+/* sk_to_full_sk() variant with a const argument */
+static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
+{
+#ifdef CONFIG_INET
+ if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
+ sk = ((const struct request_sock *)sk)->rsk_listener;
+#endif
return sk;
}
+static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
+{
+ return sk_to_full_sk(skb->sk);
+}
+
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
return (struct inet_sock *)sk;
static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip)
{
iaddr->a4.addr = ip;
+ iaddr->a4.vif = 0;
iaddr->family = AF_INET;
}
/* IPv4 ops */
struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
const struct flowi4 *fl4);
- void (*l3mdev_get_saddr)(struct net_device *dev,
+ int (*l3mdev_get_saddr)(struct net_device *dev,
struct flowi4 *fl4);
/* IPv6 ops */
return rc;
}
-static inline void l3mdev_get_saddr(struct net *net, int ifindex,
- struct flowi4 *fl4)
+static inline int l3mdev_get_saddr(struct net *net, int ifindex,
+ struct flowi4 *fl4)
{
struct net_device *dev;
+ int rc = 0;
if (ifindex) {
dev = dev_get_by_index_rcu(net, ifindex);
if (dev && netif_is_l3_master(dev) &&
dev->l3mdev_ops->l3mdev_get_saddr) {
- dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);
+ rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);
}
rcu_read_unlock();
}
+
+ return rc;
}
static inline struct dst_entry *l3mdev_get_rt6_dst(const struct net_device *dev,
return false;
}
-static inline void l3mdev_get_saddr(struct net *net, int ifindex,
- struct flowi4 *fl4)
+static inline int l3mdev_get_saddr(struct net *net, int ifindex,
+ struct flowi4 *fl4)
{
+ return 0;
}
static inline
sport, dport, sk);
if (!src && oif) {
- l3mdev_get_saddr(net, oif, fl4);
+ int rc;
+
+ rc = l3mdev_get_saddr(net, oif, fl4);
+ if (rc < 0)
+ return ERR_PTR(rc);
+
src = fl4->saddr;
}
if (!dst || !src) {
* : SACK's are not delayed (see Section 6).
*/
__u8 sack_needed:1, /* Do we need to sack the peer? */
- sack_generation:1;
+ sack_generation:1,
+ zero_window_announced:1;
__u32 sack_cnt;
__u32 adaptation_ind; /* Adaptation Code point. */
struct socket_wq *sk_wq_raw;
};
#ifdef CONFIG_XFRM
- struct xfrm_policy *sk_policy[2];
+ struct xfrm_policy __rcu *sk_policy[2];
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
sk_userlocks : 4,
sk_protocol : 8,
sk_type : 16;
+#define SK_PROTOCOL_MAX U8_MAX
kmemcheck_bitfield_end(flags);
int sk_wmem_queued;
gfp_t sk_allocation;
SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
};
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
{
nsk->sk_flags = osk->sk_flags;
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
/* dont let skb dst not refcounted, we are going to leave rcu lock */
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
if (!sk->sk_backlog.tail)
sk->sk_backlog.head = skb;
};
/* VXLAN header flags. */
-#define VXLAN_HF_RCO BIT(24)
+#define VXLAN_HF_RCO BIT(21)
#define VXLAN_HF_VNI BIT(27)
#define VXLAN_HF_GBP BIT(31)
u16 family;
struct xfrm_sec_ctx *security;
struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];
+ struct rcu_head rcu;
};
static inline struct net *xp_net(const struct xfrm_policy *xp)
return xfrm_route_forward(skb, AF_INET6);
}
-int __xfrm_sk_clone_policy(struct sock *sk);
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk);
-static inline int xfrm_sk_clone_policy(struct sock *sk)
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
- if (unlikely(sk->sk_policy[0] || sk->sk_policy[1]))
- return __xfrm_sk_clone_policy(sk);
+ sk->sk_policy[0] = NULL;
+ sk->sk_policy[1] = NULL;
+ if (unlikely(osk->sk_policy[0] || osk->sk_policy[1]))
+ return __xfrm_sk_clone_policy(sk, osk);
return 0;
}
static inline void xfrm_sk_free_policy(struct sock *sk)
{
- if (unlikely(sk->sk_policy[0] != NULL)) {
- xfrm_policy_delete(sk->sk_policy[0], XFRM_POLICY_MAX);
+ struct xfrm_policy *pol;
+
+ pol = rcu_dereference_protected(sk->sk_policy[0], 1);
+ if (unlikely(pol != NULL)) {
+ xfrm_policy_delete(pol, XFRM_POLICY_MAX);
sk->sk_policy[0] = NULL;
}
- if (unlikely(sk->sk_policy[1] != NULL)) {
- xfrm_policy_delete(sk->sk_policy[1], XFRM_POLICY_MAX+1);
+ pol = rcu_dereference_protected(sk->sk_policy[1], 1);
+ if (unlikely(pol != NULL)) {
+ xfrm_policy_delete(pol, XFRM_POLICY_MAX+1);
sk->sk_policy[1] = NULL;
}
}
#else
static inline void xfrm_sk_free_policy(struct sock *sk) {}
-static inline int xfrm_sk_clone_policy(struct sock *sk) { return 0; }
+static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; }
static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; }
static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb)
/* Helper functions */
static inline void snd_soc_dapm_mutex_lock(struct snd_soc_dapm_context *dapm)
{
- mutex_lock(&dapm->card->dapm_mutex);
+ mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME);
}
static inline void snd_soc_dapm_mutex_unlock(struct snd_soc_dapm_context *dapm)
header-y += if_vlan.h
header-y += if_x25.h
header-y += igmp.h
+header-y += ila.h
header-y += in6.h
header-y += inet_diag.h
header-y += in.h
* @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the
* mask, the corresponding bit in the value is copied to the connection
* tracking mark field in the connection.
- * @OVS_CT_ATTR_LABEL: %OVS_CT_LABELS_LEN value followed by %OVS_CT_LABELS_LEN
+ * @OVS_CT_ATTR_LABELS: %OVS_CT_LABELS_LEN value followed by %OVS_CT_LABELS_LEN
* mask. For each bit set in the mask, the corresponding bit in the value is
* copied to the connection tracking label field in the connection.
* @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
#define RING_GET_REQUEST(_r, _idx) \
(&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req))
+/*
+ * Get a local copy of a request.
+ *
+ * Use this in preference to RING_GET_REQUEST() so all processing is
+ * done on a local copy that cannot be modified by the other end.
+ *
+ * Note that https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 may cause this
+ * to be ineffective where _req is a struct which consists of only bitfields.
+ */
+#define RING_COPY_REQUEST(_r, _idx, _req) do { \
+ /* Use volatile to force the copy into _req. */ \
+ *(_req) = *(volatile typeof(_req))RING_GET_REQUEST(_r, _idx); \
+} while (0)
+
#define RING_GET_RESPONSE(_r, _idx) \
(&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp))
flush_delayed_fput();
+ rcu_end_inkernel_boot();
+
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
#endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
+ tsk->wake_q.next = NULL;
account_kernel_stack(ti, 1);
}
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
* exist yet, look it up one more time to ensure we have a
* reference to it. If the lock was taken, ret contains the
* vpid of the top waiter task.
+ * If the lock was not taken, we have pi_state and an initial
+ * refcount on it. In case of an error we have nothing.
*/
if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
/*
- * If we acquired the lock, then the user
- * space value of uaddr2 should be vpid. It
- * cannot be changed by the top waiter as it
- * is blocked on hb2 lock if it tries to do
- * so. If something fiddled with it behind our
- * back the pi state lookup might unearth
- * it. So we rather use the known value than
- * rereading and handing potential crap to
- * lookup_pi_state.
+ * If we acquired the lock, then the user space value
+ * of uaddr2 should be vpid. It cannot be changed by
+ * the top waiter as it is blocked on hb2 lock if it
+ * tries to do so. If something fiddled with it behind
+ * our back the pi state lookup might unearth it. So
+ * we rather use the known value than rereading and
+ * handing potential crap to lookup_pi_state.
+ *
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
*/
ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
case 0:
+ /* We hold a reference on the pi state. */
break;
+
+ /* If the above failed, then pi_state is NULL */
case -EFAULT:
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
* exit to complete.
* - The user space value changed.
*/
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
* of requeue_pi if we couldn't acquire the lock atomically.
*/
if (requeue_pi) {
- /* Prepare the waiter to take the rt_mutex. */
+ /*
+ * Prepare the waiter to take the rt_mutex. Take a
+ * refcount on the pi_state and store the pointer in
+ * the futex_q object of the waiter.
+ */
atomic_inc(&pi_state->refcount);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
if (ret == 1) {
- /* We got the lock. */
+ /*
+ * We got the lock. We neither drop the
+ * refcount on pi_state nor clear
+ * this->pi_state because the waiter needs the
+ * pi_state for cleaning up the user space
+ * value. It will drop the refcount after
+ * doing so.
+ */
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
} else if (ret) {
- /* -EDEADLK */
+ /*
+ * rt_mutex_start_proxy_lock() detected a
+ * potential deadlock when we tried to queue
+ * that waiter. Drop the pi_state reference
+ * which we took above and remove the pointer
+ * to the state from the waiter's futex_q
+ * object.
+ */
this->pi_state = NULL;
- free_pi_state(pi_state);
- goto out_unlock;
+ put_pi_state(pi_state);
+ /*
+ * We stop queueing more waiters and let user
+ * space deal with the mess.
+ */
+ break;
}
}
requeue_futex(this, hb1, hb2, &key2);
drop_count++;
}
+ /*
+ * We took an extra initial reference to the pi_state either
+ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+ * need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
out_unlock:
- free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
__unqueue_futex(q);
BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
}
} else {
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
+ cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}
if (!desc)
return NULL;
+ chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
/*
if (!action) {
WARN(1, "Trying to free already-free IRQ %d\n", irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ chip_bus_sync_unlock(desc);
return NULL;
}
#endif
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
unregister_handler_proc(irq, action);
desc->affinity_notify = NULL;
#endif
- chip_bus_lock(desc);
kfree(__free_irq(irq, dev_id));
- chip_bus_sync_unlock(desc);
}
EXPORT_SYMBOL(free_irq);
#include <linux/capability.h>
#include <linux/compiler.h>
-#include <linux/rcupdate.h> /* rcu_expedited */
+#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
}
KERNEL_ATTR_RO(fscaps);
+#ifndef CONFIG_TINY_RCU
int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", rcu_expedited);
+ return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
}
KERNEL_ATTR_RW(rcu_expedited);
+int rcu_normal;
+static ssize_t rcu_normal_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+}
+static ssize_t rcu_normal_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (kstrtoint(buf, 0, &rcu_normal))
+ return -EINVAL;
+
+ return count;
+}
+KERNEL_ATTR_RW(rcu_normal);
+#endif /* #ifndef CONFIG_TINY_RCU */
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
&kexec_crash_size_attr.attr,
&vmcoreinfo_attr.attr,
#endif
+#ifndef CONFIG_TINY_RCU
&rcu_expedited_attr.attr,
+ &rcu_normal_attr.attr,
+#endif
NULL
};
node->cpu = curr;
/*
- * ACQUIRE semantics, pairs with corresponding RELEASE
- * in unlock() uncontended, or fastpath.
+ * We need both ACQUIRE (pairs with corresponding RELEASE in
+ * unlock() uncontended, or fastpath) and RELEASE (to publish
+ * the node fields we just initialised) semantics when updating
+ * the lock tail.
*/
- old = atomic_xchg_acquire(&lock->tail, curr);
+ old = atomic_xchg(&lock->tail, curr);
if (old == OSQ_UNLOCKED_VAL)
return true;
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
*
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
* Peter Zijlstra <peterz@infradead.org>
*/
{
struct __qspinlock *l = (void *)lock;
- return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+ /*
+ * Use release semantics to make sure that the MCS node is properly
+ * initialized before changing the tail code.
+ */
+ return (u32)xchg_release(&l->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}
#else /* _Q_PENDING_BITS == 8 */
for (;;) {
new = (val & _Q_LOCKED_PENDING_MASK) | tail;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Use release semantics to make sure that the MCS node is
+ * properly initialized before changing the tail code.
+ */
+ old = atomic_cmpxchg_release(&lock->val, val, new);
if (old == val)
break;
*/
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+ struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
- struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+ { return 0; }
#define pv_enabled() false
#define pv_init_node __pv_init_node
#define pv_wait_node __pv_wait_node
#define pv_kick_node __pv_kick_node
-#define pv_wait_head __pv_wait_head
+#define pv_wait_head_or_lock __pv_wait_head_or_lock
#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
if (val == new)
new |= _Q_PENDING_VAL;
- old = atomic_cmpxchg(&lock->val, val, new);
+ /*
+ * Acquire semantic is required here as the function may
+ * return immediately if the lock was free.
+ */
+ old = atomic_cmpxchg_acquire(&lock->val, val, new);
if (old == val)
break;
* p,*,* -> n,*,*
*/
old = xchg_tail(lock, tail);
+ next = NULL;
/*
* if there was a previous node; link it and wait until reaching the
prev = decode_tail(old);
WRITE_ONCE(prev->next, node);
- pv_wait_node(node);
+ pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(&node->locked);
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
}
/*
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
+ * The PV pv_wait_head_or_lock function, if active, will acquire
+ * the lock and return a non-zero value. So we have to skip the
+ * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * designated yet, there is no way for the locked value to become
+ * _Q_SLOW_VAL. So both the set_locked() and the
+ * atomic_cmpxchg_relaxed() calls will be safe.
+ *
+ * If PV isn't active, 0 will be returned instead.
+ *
*/
- pv_wait_head(lock, node);
- while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
- cpu_relax();
+ if ((val = pv_wait_head_or_lock(lock, node)))
+ goto locked;
+ smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
/*
* claim the lock:
*
* to grab the lock.
*/
for (;;) {
- if (val != tail) {
+ /* In the PV case we might already have _Q_LOCKED_VAL set */
+ if ((val & _Q_TAIL_MASK) != tail) {
set_locked(lock);
break;
}
- old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+ /*
+ * The smp_cond_acquire() call above has provided the necessary
+ * acquire semantics required for locking. At most two
+ * iterations of this loop may be run.
+ */
+ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
goto release; /* No contention */
}
/*
- * contended path; wait for next, release.
+ * contended path; wait for next if not observed yet, release.
*/
- while (!(next = READ_ONCE(node->next)))
- cpu_relax();
+ if (!next) {
+ while (!(next = READ_ONCE(node->next)))
+ cpu_relax();
+ }
arch_mcs_spin_unlock_contended(&next->locked);
pv_kick_node(lock, next);
#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
/*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guests with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at a fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK 0xff
+
+/*
* Queue node uses: vcpu_running & vcpu_halted.
* Queue head uses: vcpu_running & vcpu_hashed.
*/
};
/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enters the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ struct __qspinlock *l = (void *)lock;
+
+ return !READ_ONCE(l->locked) &&
+ (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+ == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+ atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+ int val = atomic_read(&lock->val);
+
+ for (;;) {
+ int old, new;
+
+ if (val & _Q_LOCKED_MASK)
+ break;
+
+ /*
+ * Try to clear pending bit & set locked bit
+ */
+ old = val;
+ new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+ val = atomic_cmpxchg(&lock->val, old, new);
+
+ if (val == old)
+ return 1;
+ }
+ return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
* Lock and MCS node addresses hash table for fast lookup
*
* Hashing is done on a per-cacheline basis to minimize the need to access
{
unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
struct pv_hash_entry *he;
+ int hopcnt = 0;
for_each_hash_entry(he, offset, hash) {
+ hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
+ qstat_hop(hopcnt);
return &he->lock;
}
}
}
/*
+ * Return true when it is time to check the previous node and that node is
+ * not in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+ if ((loop & PV_PREV_CHECK_MASK) != 0)
+ return false;
+
+ return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
* Initialize the PV part of the mcs_spinlock node.
*/
static void pv_init_node(struct mcs_spinlock *node)
* pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
* behalf.
*/
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct pv_node *pp = (struct pv_node *)prev;
+ int waitcnt = 0;
int loop;
+ bool wait_early;
- for (;;) {
- for (loop = SPIN_THRESHOLD; loop; loop--) {
+ /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+ for (;; waitcnt++) {
+ for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
if (READ_ONCE(node->locked))
return;
+ if (pv_wait_early(pp, loop)) {
+ wait_early = true;
+ break;
+ }
cpu_relax();
}
*/
smp_store_mb(pn->state, vcpu_halted);
- if (!READ_ONCE(node->locked))
+ if (!READ_ONCE(node->locked)) {
+ qstat_inc(qstat_pv_wait_node, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
+ qstat_inc(qstat_pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
+ }
/*
- * If pv_kick_node() changed us to vcpu_hashed, retain that value
- * so that pv_wait_head() knows to not also try to hash this lock.
+ * If pv_kick_node() changed us to vcpu_hashed, retain that
+ * value so that pv_wait_head_or_lock() knows to not also try
+ * to hash this lock.
*/
cmpxchg(&pn->state, vcpu_halted, vcpu_running);
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
}
/*
/*
* Called after setting next->locked = 1 when we're the lock owner.
*
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
*/
static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
}
/*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
* __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
*/
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
struct __qspinlock *l = (void *)lock;
struct qspinlock **lp = NULL;
+ int waitcnt = 0;
int loop;
/*
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
- for (;;) {
+ for (;; waitcnt++) {
+ /*
+ * Set correct vCPU state to be used by queue node wait-early
+ * mechanism.
+ */
+ WRITE_ONCE(pn->state, vcpu_running);
+
+ /*
+ * Set the pending bit in the active lock spinning loop to
+ * disable lock stealing before attempting to acquire the lock.
+ */
+ set_pending(lock);
for (loop = SPIN_THRESHOLD; loop; loop--) {
- if (!READ_ONCE(l->locked))
- return;
+ if (trylock_clear_pending(lock))
+ goto gotlock;
cpu_relax();
}
+ clear_pending(lock);
+
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
*
* Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
- if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+ if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
/*
- * The lock is free and _Q_SLOW_VAL has never
- * been set. Therefore we need to unhash before
- * getting the lock.
+ * The lock was free and now we own the lock.
+ * Change the lock value back to _Q_LOCKED_VAL
+ * and unhash the table.
*/
+ WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
WRITE_ONCE(*lp, NULL);
- return;
+ goto gotlock;
}
}
+ WRITE_ONCE(pn->state, vcpu_halted);
+ qstat_inc(qstat_pv_wait_head, true);
+ qstat_inc(qstat_pv_wait_again, waitcnt);
pv_wait(&l->locked, _Q_SLOW_VAL);
/*
* The unlocker should have freed the lock before kicking the
* CPU. So if the lock is still not free, it is a spurious
- * wakeup and so the vCPU should wait again after spinning for
- * a while.
+ * wakeup or another vCPU has stolen the lock. The current
+ * vCPU should spin again.
*/
+ qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
}
/*
- * Lock is unlocked now; the caller will acquire it without waiting.
- * As with pv_wait_node() we rely on the caller to do a load-acquire
- * for us.
+ * The cmpxchg() or xchg() call before coming here provides the
+ * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+ * here is to indicate to the compiler that the value will always
+ * be nonzero to enable better code optimization.
*/
+gotlock:
+ return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
}
/*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
*/
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 locked;
-
- /*
- * We must not unlock if SLOW, because in that case we must first
- * unhash. Otherwise it would be possible to have multiple @lock
- * entries, which would be BAD.
- */
- locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
- if (likely(locked == _Q_LOCKED_VAL))
- return;
if (unlikely(locked != _Q_SLOW_VAL)) {
WARN(!debug_locks_silent,
* so we need a barrier to order the read of the node data in
* pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
*
- * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
*/
smp_rmb();
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
+ qstat_inc(qstat_pv_kick_unlock, true);
pv_kick(node->cpu);
}
+
/*
* Include the architecture specific callee-save thunk of the
* __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, an architecture-specific version of
+ * __pv_queued_spin_unlock() can be defined.
*/
#include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	u8 old;
+
+	/*
+	 * Release the lock only if it holds a plain _Q_LOCKED_VAL. A SLOW
+	 * lock must be unhashed before it is released, otherwise multiple
+	 * @lock entries could exist, which would be BAD. Any value other
+	 * than _Q_LOCKED_VAL is therefore handed to the slowpath together
+	 * with the value that was observed.
+	 */
+	old = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	if (unlikely(old != _Q_LOCKED_VAL))
+		__pv_queued_spin_unlock_slowpath(lock, old);
+}
+#endif /* __pv_queued_spin_unlock */
--- /dev/null
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ * pv_hash_hops - average # of hops per hashing operation
+ * pv_kick_unlock - # of vCPU kicks issued at unlock time
+ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
+ * pv_latency_kick - average latency (ns) of vCPU kick operation
+ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing - # of lock stealing operations
+ * pv_spurious_wakeup - # of spurious wakeups
+ * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
+ * pv_wait_early - # of early vCPU wait's
+ * pv_wait_head - # of vCPU wait's at the queue head
+ * pv_wait_node - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be a slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+/*
+ * Counter IDs. When CONFIG_QUEUED_LOCK_STAT is enabled they index the
+ * per-cpu qstats[] array and the qstat_names[] table.
+ */
+enum qlock_stats {
+ qstat_pv_hash_hops,
+ qstat_pv_kick_unlock,
+ qstat_pv_kick_wake,
+ qstat_pv_latency_kick,
+ qstat_pv_latency_wake,
+ qstat_pv_lock_stealing,
+ qstat_pv_spurious_wakeup,
+ qstat_pv_wait_again,
+ qstat_pv_wait_early,
+ qstat_pv_wait_head,
+ qstat_pv_wait_node,
+ qstat_num, /* Total number of statistical counters */
+ qstat_reset_cnts = qstat_num, /* ID of the "reset_counters" file, not a counter */
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+/*
+ * debugfs file name for each counter ID, listed in enum qlock_stats order.
+ * The extra trailing slot names the write-only reset file.
+ */
+static const char * const qstat_names[qstat_num + 1] = {
+	[qstat_pv_hash_hops]		= "pv_hash_hops",
+	[qstat_pv_kick_unlock]		= "pv_kick_unlock",
+	[qstat_pv_kick_wake]		= "pv_kick_wake",
+	[qstat_pv_latency_kick]		= "pv_latency_kick",
+	[qstat_pv_latency_wake]		= "pv_latency_wake",
+	[qstat_pv_lock_stealing]	= "pv_lock_stealing",
+	[qstat_pv_spurious_wakeup]	= "pv_spurious_wakeup",
+	[qstat_pv_wait_again]		= "pv_wait_again",
+	[qstat_pv_wait_early]		= "pv_wait_early",
+	[qstat_pv_wait_head]		= "pv_wait_head",
+	[qstat_pv_wait_node]		= "pv_wait_node",
+	[qstat_reset_cnts]		= "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The counter ID is taken from file->f_inode->i_private and the per-cpu
+ * values of that counter are summed over all possible CPUs. The result is
+ * formatted as text and copied out with simple_read_from_buffer(), whose
+ * return value is passed back to the caller. -EBADF is returned when the
+ * file has no inode or the ID does not name a counter (e.g. the reset file).
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ *    Average hops/hash = pv_hash_hops/pv_kick_unlock
+ *
+ * Each of these averages is reported as 0 while its divisor counter is 0.
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+			  size_t count, loff_t *ppos)
+{
+	char buf[64];
+	int cpu, counter, len;
+	u64 stat = 0, kicks = 0;
+
+	/*
+	 * Get the counter ID stored in file->f_inode->i_private
+	 */
+	if (!file->f_inode) {
+		WARN_ON_ONCE(1);
+		return -EBADF;
+	}
+	counter = (long)(file->f_inode->i_private);
+
+	if (counter >= qstat_num)
+		return -EBADF;
+
+	for_each_possible_cpu(cpu) {
+		stat += per_cpu(qstats[counter], cpu);
+		/*
+		 * Need to sum additional counter for some of them
+		 */
+		switch (counter) {
+
+		case qstat_pv_latency_kick:
+		case qstat_pv_hash_hops:
+			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+			break;
+
+		case qstat_pv_latency_wake:
+			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+			break;
+		}
+	}
+
+	if (counter == qstat_pv_hash_hops) {
+		u64 frac = 0;
+
+		/*
+		 * do_div() leaves the quotient in stat and returns the
+		 * remainder. Both divisions must be skipped when no kick
+		 * has been counted yet, otherwise we divide by zero.
+		 */
+		if (kicks) {
+			frac = 100ULL * do_div(stat, kicks);
+			frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+		} else {
+			stat = 0;
+		}
+
+		/*
+		 * Return a X.XX decimal number
+		 */
+		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+	} else {
+		/*
+		 * Round to the nearest ns. The summed latency must be kept
+		 * until after the division; it only falls back to 0 when
+		 * there is nothing to average over.
+		 */
+		if ((counter == qstat_pv_latency_kick) ||
+		    (counter == qstat_pv_latency_wake))
+			stat = kicks ? DIV_ROUND_CLOSEST_ULL(stat, kicks) : 0;
+
+		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+	}
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Write handler shared by all the qlockstat files
+ *
+ * Only a write to the reset_counters file has an effect: every per-cpu
+ * counter is cleared. Counter updates aren't atomic, so each CPU's counters
+ * are cleared in two passes to make it very likely that all of them end up
+ * zero. The written data is never looked at and count is returned so the
+ * write always appears to be fully consumed.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+			   size_t count, loff_t *ppos)
+{
+	int cpu;
+
+	/*
+	 * The counter ID lives in file->f_inode->i_private
+	 */
+	if (!file->f_inode) {
+		WARN_ON_ONCE(1);
+		return -EBADF;
+	}
+
+	if ((long)(file->f_inode->i_private) == qstat_reset_cnts) {
+		for_each_possible_cpu(cpu) {
+			unsigned long *cnts = per_cpu_ptr(qstats, cpu);
+			int pass, idx;
+
+			for (pass = 0; pass < 2; pass++)
+				for (idx = 0; idx < qstat_num; idx++)
+					WRITE_ONCE(cnts[idx], 0);
+		}
+	}
+	return count;
+}
+
+/*
+ * Debugfs data structures
+ *
+ * One set of file operations serves every qlockstat file; the read and
+ * write handlers tell the files apart by the ID kept in the inode's
+ * i_private field.
+ */
+static const struct file_operations fops_qstat = {
+ .read = qstat_read,
+ .write = qstat_write,
+ .llseek = default_llseek,
+};
+
+/*
+ * Set up the "qlockstat" debugfs directory and its files
+ *
+ * Always returns 0; a missing directory is only reported with a warning.
+ */
+static int __init init_qspinlock_stat(void)
+{
+	struct dentry *dir;
+	int id;
+
+	dir = debugfs_create_dir("qlockstat", NULL);
+	if (!dir) {
+		pr_warn("Could not create 'qlockstat' debugfs directory\n");
+		return 0;
+	}
+
+	/*
+	 * One file per ID, with the ID stashed as the file's private data.
+	 *
+	 * Reading from and writing to the stat files can be slow, so access
+	 * is restricted to root to limit the impact on system performance:
+	 * counters are read-only, and the final ID (qstat_reset_cnts, which
+	 * equals qstat_num) is the write-only reset file.
+	 */
+	for (id = 0; id <= qstat_num; id++) {
+		umode_t mode = (id == qstat_reset_cnts) ? 0200 : 0400;
+
+		debugfs_create_file(qstat_names[id], mode, dir,
+				    (void *)(long)id, &fops_qstat);
+	}
+	return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Bump this CPU's copy of the given PV qspinlock counter, but only when
+ * cond is true.
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+	if (!cond)
+		return;
+
+	this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ *
+ * Adds hopcnt to this CPU's pv_hash_hops counter. qstat_read() later sums
+ * the per-cpu values and divides by pv_kick_unlock to report an average.
+ */
+static inline void qstat_hop(int hopcnt)
+{
+ this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ *
+ * Stamps the target CPU's pv_kick_time with the time the kick started, so
+ * that __pv_wait() can derive the wakeup latency, and charges the time
+ * spent inside pv_kick() itself to this CPU's pv_latency_kick counter.
+ */
+static inline void __pv_kick(int cpu)
+{
+	u64 t0 = sched_clock();
+	u64 elapsed;
+
+	per_cpu(pv_kick_time, cpu) = t0;
+	pv_kick(cpu);
+	elapsed = sched_clock() - t0;
+	this_cpu_add(qstats[qstat_pv_latency_kick], elapsed);
+}
+
+/*
+ * Replacement function for pv_wait()
+ *
+ * This CPU's pv_kick_time is cleared before waiting. If it is non-zero
+ * afterwards, a __pv_kick() aimed at this CPU stamped it in the meantime,
+ * so the kick-to-wakeup latency is accumulated and the wakeup is counted.
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+	u64 *stamp = this_cpu_ptr(&pv_kick_time);
+
+	*stamp = 0;
+	pv_wait(ptr, val);
+
+	/* Not stamped by a kick: nothing to account */
+	if (!*stamp)
+		return;
+
+	this_cpu_add(qstats[qstat_pv_latency_wake], sched_clock() - *stamp);
+	qstat_inc(qstat_pv_kick_wake, true);
+}
+
+#define pv_kick(c) __pv_kick(c)
+#define pv_wait(p, v) __pv_wait(p, v)
+
+/*
+ * PV unfair trylock wrapper that counts successful lock steals
+ *
+ * Returns whatever pv_queued_spin_steal_lock() returned.
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+	int stolen = pv_queued_spin_steal_lock(lock);
+
+	if (stolen)
+		this_cpu_inc(qstats[qstat_pv_lock_stealing]);
+	return stolen;
+}
+#undef queued_spin_trylock
+#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+/* Statistics are disabled: the counter hooks are empty inline functions. */
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt) { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
synchronize_sched();
mutex_unlock(&module_mutex);
free_module:
+ /*
+ * Ftrace needs to clean up what it initialized.
+ * This does nothing if ftrace_module_init() wasn't called,
+ * but it must be called outside of module_mutex.
+ */
+ ftrace_release_mod(mod);
/* Free lock-classes; relies on the preceding sync_rcu() */
lockdep_free_key_range(mod->module_core, mod->core_size);
#define RTWS_SYNC 7
#define RTWS_STUTTER 8
#define RTWS_STOPPING 9
+/*
+ * Printable names for the writer states, indexed by the numeric RTWS_*
+ * value. The order must match the RTWS_* #defines (for example RTWS_SYNC
+ * is 7, RTWS_STUTTER is 8 and RTWS_STOPPING is 9).
+ */
+static const char * const rcu_torture_writer_state_names[] = {
+ "RTWS_FIXED_DELAY",
+ "RTWS_DELAY",
+ "RTWS_REPLACE",
+ "RTWS_DEF_FREE",
+ "RTWS_EXP_SYNC",
+ "RTWS_COND_GET",
+ "RTWS_COND_SYNC",
+ "RTWS_SYNC",
+ "RTWS_STUTTER",
+ "RTWS_STOPPING",
+};
+
+/*
+ * Return the name of the current writer state, or "???" when the state
+ * value has no entry in rcu_torture_writer_state_names[].
+ */
+static const char *rcu_torture_writer_state_getname(void)
+{
+	unsigned int state = READ_ONCE(rcu_torture_writer_state);
+
+	return state < ARRAY_SIZE(rcu_torture_writer_state_names)
+		? rcu_torture_writer_state_names[state]
+		: "???";
+}
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
rcutorture_get_gp_data(cur_ops->ttype,
&flags, &gpnum, &completed);
- pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
+ rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags);
show_rcu_gp_kthreads();
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, rcu_gp_is_expedited()
+ __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
: SYNCHRONIZE_SRCU_TRYCOUNT);
}
/* Data structures. */
-static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
-
/*
* In order to export the rcu_state name to the tracing tools, it
* needs to be added in the __tracepoint_string section.
*/
void rcu_sched_qs(void)
{
- unsigned long flags;
-
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
- trace_rcu_grace_period(TPS("rcu_sched"),
- __this_cpu_read(rcu_sched_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- local_irq_save(flags);
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data),
- true);
- }
- local_irq_restore(flags);
- }
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
+ return;
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
}
void rcu_bh_qs(void)
* We inform the RCU core by emulating a zero-duration dyntick-idle
* period, which we in turn do by incrementing the ->dynticks counter
* by two.
+ *
+ * The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
- local_irq_save(flags);
-
/*
* Yes, we can lose flag-setting operations. This is OK, because
* the flag will be set again after some delay.
smp_mb__after_atomic(); /* Later stuff after QS. */
break;
}
- local_irq_restore(flags);
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
+ * The caller must have disabled interrupts.
*/
void rcu_note_context_switch(void)
{
*/
void rcu_all_qs(void)
{
+ unsigned long flags;
+
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ local_irq_save(flags);
rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
* The caller must have disabled interrupts to prevent races with
* normal callback registry.
*/
-static int
+static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
int i;
if (rcu_gp_in_progress(rsp))
- return 0; /* No, a grace period is already in progress. */
+ return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
- return 1; /* Yes, a no-CBs CPU needs one. */
+ return true; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
- return 0; /* No, this is a no-CBs (or offline) CPU. */
+ return false; /* No, this is a no-CBs (or offline) CPU. */
if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
- return 1; /* Yes, this CPU has newly registered callbacks. */
+ return true; /* Yes, CPU has newly registered callbacks. */
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
ULONG_CMP_LT(READ_ONCE(rsp->completed),
rdp->nxtcompleted[i]))
- return 1; /* Yes, CBs for future grace period. */
- return 0; /* No grace period needed. */
+ return true; /* Yes, CBs for future grace period. */
+ return false; /* No grace period needed. */
}
/*
*
* Exit from an interrupt handler, which might possibly result in entering
* idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur.
+ * sections can occur. The caller must have disabled interrupts.
*
* This code assumes that the idle loop never does anything that might
* result in unbalanced calls to irq_enter() and irq_exit(). If your
*/
void rcu_irq_exit(void)
{
- unsigned long flags;
long long oldval;
struct rcu_dynticks *rdtp;
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
else
rcu_eqs_enter_common(oldval, true);
rcu_sysidle_enter(1);
+}
+
+/*
+ * Wrapper for rcu_irq_exit() where interrupts are enabled.
+ */
+void rcu_irq_exit_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_irq_exit();
local_irq_restore(flags);
}
*
* Enter an interrupt handler, which might possibly result in exiting
* idle mode, in other words, entering the mode in which read-side critical
- * sections can occur.
+ * sections can occur. The caller must have disabled interrupts.
*
* Note that the Linux kernel is fully capable of entering an interrupt
* handler that it never exits, for example when doing upcalls to
*/
void rcu_irq_enter(void)
{
- unsigned long flags;
struct rcu_dynticks *rdtp;
long long oldval;
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
else
rcu_eqs_exit_common(oldval, true);
rcu_sysidle_exit(1);
+}
+
+/*
+ * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ */
+void rcu_irq_enter_irqson(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_irq_enter();
local_irq_restore(flags);
}
}
/*
+ * Convert a ->gp_state value to a character string.
+ */
+static const char *gp_state_getname(short gs)
+{
+	/* Anything outside gp_state_names[] gets a placeholder name. */
+	if (gs >= 0 && gs < ARRAY_SIZE(gp_state_names))
+		return gp_state_names[gs];
+
+	return "???";
+}
+
+/*
* Complain about starvation of grace-period kthread.
*/
static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
- if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
+ if (j - gpa > 2 * HZ) {
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
- rsp->gp_flags, rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : 0);
+ rsp->gp_flags,
+ gp_state_getname(rsp->gp_state), rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ if (rsp->gp_kthread)
+ sched_show_task(rsp->gp_kthread);
+ }
}
/*
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask != 0) {
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
if (rnp->qsmask & (1UL << cpu))
/* Only let one CPU complain about others per time interval. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rsp->name);
print_cpu_stall_info_begin();
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
rcu_dump_cpu_stacks(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
* hold it, acquire the root rcu_node structure's lock in order to
* start one (if needed).
*/
- if (rnp != rnp_root) {
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
- }
+ if (rnp != rnp_root)
+ raw_spin_lock_rcu_node(rnp_root);
/*
* Get a new grace-period number. If there really is no grace
if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
rdp->completed == READ_ONCE(rnp->completed) &&
!unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
- !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
+ !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
local_irq_restore(flags);
return;
}
- smp_mb__after_unlock_lock();
needwake = __note_gp_changes(rsp, rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (needwake)
}
/*
- * Initialize a new grace period. Return 0 if no grace period required.
+ * Initialize a new grace period. Return false if no grace period required.
*/
-static int rcu_gp_init(struct rcu_state *rsp)
+static bool rcu_gp_init(struct rcu_state *rsp)
{
unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ return false;
}
WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
* Not supposed to be able to happen.
*/
raw_spin_unlock_irq(&rnp->lock);
- return 0;
+ return false;
}
/* Advance to a new grace period and initialize state. */
*/
rcu_for_each_leaf_node(rsp, rnp) {
rcu_gp_slow(rsp, gp_preinit_delay);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
rcu_gp_slow(rsp, gp_init_delay);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
WRITE_ONCE(rsp->gp_activity, jiffies);
}
- return 1;
+ return true;
}
/*
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
WRITE_ONCE(rsp->gp_flags,
READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
* grace period is recorded in any of the rcu_node structures.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irq_rcu_node(rnp);
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->completed, rsp->gpnum);
rcu_gp_slow(rsp, gp_cleanup_delay);
}
rnp = rcu_get_root(rsp);
- raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
+ raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
rcu_nocb_gp_set(rnp, nocb);
/* Declare grace period done. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rnp_c = rnp;
rnp = rnp->parent;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
oldmask = rnp_c->qsmask;
}
gps = rnp->gpnum;
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
}
struct rcu_node *rnp;
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if ((rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
rnp = rnp->parent;
if (!rnp)
break;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock(); /* GP memory ordering. */
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rnp->qsmaskinit &= ~mask;
rnp->qsmask &= ~mask;
if (rnp->qsmaskinit) {
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rnp->qsmaskinitnext &= ~mask;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
rcu_for_each_leaf_node(rsp, rnp) {
cond_resched_rcu_qs();
mask = 0;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
if (rcu_state_p == &rcu_sched_state ||
rsp != rcu_state_p ||
/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
/* Reached the root of the rcu_node tree, acquire lock. */
- raw_spin_lock_irqsave(&rnp_old->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
raw_spin_unlock(&rnp_old->fqslock);
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
/* Does this CPU require a not-yet-started grace period? */
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
- raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
+ raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
needwake = rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
if (needwake)
if (!rcu_gp_in_progress(rsp)) {
struct rcu_node *rnp_root = rcu_get_root(rsp);
- raw_spin_lock(&rnp_root->lock);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp_root);
needwake = rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
if (needwake)
{
unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
s = (READ_ONCE(*sp) + 3) & ~0x1;
smp_mb(); /* Above access must not bleed into critical section. */
return s;
}
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
return rcu_seq_snap(&rsp->expedited_sequence);
}
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
* CPUs for the current rcu_node structure up the rcu_node tree.
*/
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmaskinit == rnp->expmaskinitnext) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
continue; /* No new CPUs, nothing to do. */
rnp_up = rnp->parent;
done = false;
while (rnp_up) {
- raw_spin_lock_irqsave(&rnp_up->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
if (rnp_up->expmaskinit)
done = true;
rnp_up->expmaskinit |= mask;
sync_exp_reset_tree_hotplug(rsp);
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
rnp->expmask = rnp->expmaskinit;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
rnp = rnp->parent;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
rnp->expmask &= ~mask;
}
{
unsigned long flags;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
__rcu_report_exp_rnp(rsp, rnp, wake, flags);
}
{
unsigned long flags;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
*/
static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
{
- struct rcu_data *rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
struct rcu_node *rnp0;
struct rcu_node *rnp1 = NULL;
if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
if (sync_exp_work_done(rsp, rnp0, NULL,
- &rsp->expedited_workdone0, s))
+ &rdp->expedited_workdone0, s))
return NULL;
return rnp0;
}
* can be inexact, as it is just promoting locality and is not
* strictly needed for correctness.
*/
- rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
return NULL;
mutex_lock(&rdp->exp_funnel_mutex);
rnp0 = rdp->mynode;
for (; rnp0 != NULL; rnp0 = rnp0->parent) {
if (sync_exp_work_done(rsp, rnp1, rdp,
- &rsp->expedited_workdone2, s))
+ &rdp->expedited_workdone2, s))
return NULL;
mutex_lock(&rnp0->exp_funnel_mutex);
if (rnp1)
rnp1 = rnp0;
}
if (sync_exp_work_done(rsp, rnp1, rdp,
- &rsp->expedited_workdone3, s))
+ &rdp->expedited_workdone3, s))
return NULL;
return rnp1;
}
sync_exp_reset_tree(rsp);
rcu_for_each_leaf_node(rsp, rnp) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
mask_ofl_test = 0;
ret = smp_call_function_single(cpu, func, rsp, 0);
if (!ret) {
mask_ofl_ipi &= ~mask;
- } else {
- /* Failed, raced with offline. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask)) {
- raw_spin_unlock_irqrestore(&rnp->lock,
- flags);
- schedule_timeout_uninterruptible(1);
- if (cpu_online(cpu) &&
- (rnp->expmask & mask))
- goto retry_ipi;
- raw_spin_lock_irqsave(&rnp->lock,
- flags);
- }
- if (!(rnp->expmask & mask))
- mask_ofl_ipi &= ~mask;
+ continue;
+ }
+ /* Failed, raced with offline. */
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ schedule_timeout_uninterruptible(1);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask))
+ goto retry_ipi;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/* Report quiescent states for those that went offline. */
mask_ofl_test |= mask_ofl_ipi;
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
+ int ndetected;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root(rsp);
int ret;
rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall);
- if (ret > 0)
+ if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
return;
if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */
}
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rsp->name);
+ ndetected = 0;
rcu_for_each_leaf_node(rsp, rnp) {
- (void)rcu_print_task_exp_stall(rnp);
+ ndetected = rcu_print_task_exp_stall(rnp);
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_data *rdp;
if (!(rnp->expmask & mask))
continue;
+ ndetected++;
rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu,
"O."[cpu_online(cpu)],
}
mask <<= 1;
}
- pr_cont(" } %lu jiffies s: %lu\n",
- jiffies - jiffies_start, rsp->expedited_sequence);
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ jiffies - jiffies_start, rsp->expedited_sequence,
+ rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
+ if (!ndetected) {
+ pr_err("blocking rcu_node structures:");
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_preempt_exp_done(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi,
+ rnp->expmask,
+ ".T"[!!rnp->exp_tasks]);
+ }
+ pr_cont("\n");
+ }
rcu_for_each_leaf_node(rsp, rnp) {
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
+ /* If only one CPU, this is automatically a grace period. */
+ if (rcu_blocking_is_gp())
+ return;
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu_sched);
+ return;
+ }
+
/* Take a snapshot of the sequence number. */
s = rcu_exp_gp_seq_snap(rsp);
rnp = rnp->parent;
if (rnp == NULL)
return;
- raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+ raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
rnp->qsmaskinit |= mask;
raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
}
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
struct rcu_node *rnp = rcu_get_root(rsp);
/* Set up local state, ensuring consistent view of global state. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
*/
rnp = rdp->mynode;
mask = rdp->grpmask;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
rnp->qsmaskinitnext |= mask;
rnp->expmaskinitnext |= mask;
if (!rdp->beenonline)
t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
BUG_ON(IS_ERR(t));
rnp = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rsp->gp_kthread = t;
if (kthread_prio) {
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
- wake_up_process(t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ wake_up_process(t);
}
rcu_spawn_nocb_kthreads();
rcu_spawn_boost_kthreads();
/*
* Helper function for rcu_init() that initializes one rcu_state structure.
*/
-static void __init rcu_init_one(struct rcu_state *rsp,
- struct rcu_data __percpu *rda)
+static void __init rcu_init_one(struct rcu_state *rsp)
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
static const char * const exp[] = RCU_EXP_NAME_INIT;
+ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+ static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
rcu_bootup_announce();
rcu_init_geometry();
- rcu_init_one(&rcu_bh_state, &rcu_bh_data);
- rcu_init_one(&rcu_sched_state, &rcu_sched_data);
+ rcu_init_one(&rcu_bh_state);
+ rcu_init_one(&rcu_sched_state);
if (dump_tree)
rcu_dump_rcu_node_tree(&rcu_sched_state);
__rcu_init_preempt();
/* beginning of each expedited GP. */
unsigned long expmaskinitnext;
/* Online CPUs for next expedited GP. */
+ /* Any CPU that has ever been online will */
+ /* have its bit set. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
struct mutex exp_funnel_mutex;
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
+ atomic_long_t expedited_workdone1; /* # done by others #1. */
+ atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
/* End of fields guarded by barrier_mutex. */
unsigned long expedited_sequence; /* Take a ticket. */
- atomic_long_t expedited_workdone0; /* # done by others #0. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
wait_queue_head_t expedited_wq; /* Wait for check-ins. */
#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
+#ifndef RCU_TREE_NONCORE
+static const char * const gp_state_names[] = { /* Printable names for the RCU_GP_* states. */
+ "RCU_GP_IDLE", /* index 0 */
+ "RCU_GP_WAIT_GPS", /* index 1 */
+ "RCU_GP_DONE_GPS", /* index 2 */
+ "RCU_GP_WAIT_FQS", /* index 3 */
+ "RCU_GP_DOING_FQS", /* index 4 */
+ "RCU_GP_CLEANUP", /* index 5, same as the RCU_GP_CLEANUP define */
+ "RCU_GP_CLEANED", /* index 6, same as the RCU_GP_CLEANED define */
+}; /* NOTE(review): indices 0-4 presumably match their defines too -- confirm. */
+#endif /* #ifndef RCU_TREE_NONCORE */
+
extern struct list_head rcu_struct_flavors;
/* Sequence through rcu_state structures for each RCU flavor. */
#else /* #ifdef CONFIG_PPC */
#define smp_mb__after_unlock_lock() do { } while (0)
#endif /* #else #ifdef CONFIG_PPC */
+
+/*
+ * Wrappers for the rcu_node::lock acquire.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values, this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ */
+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) /* Acquire rnp->lock with full ordering. */
+{
+ raw_spin_lock(&rnp->lock); /* Plain acquire; does not change the irq state. */
+ smp_mb__after_unlock_lock(); /* Restore full ordering against a prior UNLOCK of another rcu_node. */
+}
+
+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) /* As raw_spin_lock_rcu_node(), via the _irq lock variant. */
+{
+ raw_spin_lock_irq(&rnp->lock); /* The _irq variant of the acquire. */
+ smp_mb__after_unlock_lock(); /* Restore full ordering against a prior UNLOCK of another rcu_node. */
+}
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags) /* Macro, not inline: flags is written by name. */ \
+do { \
+ typecheck(unsigned long, flags); /* Reject a flags argument that is not unsigned long. */ \
+ raw_spin_lock_irqsave(&(rnp)->lock, flags); /* Acquire, saving the irq state in flags. */ \
+ smp_mb__after_unlock_lock(); /* Restore full ordering against a prior UNLOCK. */ \
+} while (0)
+
+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) /* Returns true iff rnp->lock was acquired. */
+{
+ bool locked = raw_spin_trylock(&rnp->lock); /* Non-blocking attempt. */
+
+ if (locked) /* The barrier is only needed when the lock was actually taken. */
+ smp_mb__after_unlock_lock();
+ return locked;
+}
/*
* Check the RCU kernel configuration parameters and print informative
- * messages about anything out of the ordinary. If you like #ifdef, you
- * will love this function.
+ * messages about anything out of the ordinary.
*/
static void __init rcu_bootup_announce_oddness(void)
{
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
-static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long flags) __releases(rnp->lock)
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
+ __releases(rnp->lock) /* But leaves rrupts disabled. */
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
- raw_spin_unlock(&rnp->lock);
+ raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
- local_irq_restore(flags);
}
/*
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
- * Caller must disable preemption.
+ * Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_node *rnp;
/* Possibly blocking in an RCU read-side critical section. */
rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp);
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
- rcu_preempt_ctxt_queue(rnp, rdp, flags);
+ rcu_preempt_ctxt_queue(rnp, rdp);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
/*
* Remove this task from the list it blocked on. The task
- * now remains queued on the rcu_node corresponding to
- * the CPU it first blocked on, so the first attempt to
- * acquire the task's rcu_node's ->lock will succeed.
- * Keep the loop and add a WARN_ON() out of sheer paranoia.
+ * now remains queued on the rcu_node corresponding to the
+ * CPU it first blocked on, so there is no longer any need
+ * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
*/
- for (;;) {
- rnp = t->rcu_blocked_node;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
- if (rnp == t->rcu_blocked_node)
- break;
- WARN_ON_ONCE(1);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
+ rnp = t->rcu_blocked_node;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ WARN_ON_ONCE(rnp != t->rcu_blocked_node);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
unsigned long flags;
struct task_struct *t;
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
struct rcu_state *rsp = rcu_state_p;
unsigned long s;
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
s = rcu_exp_gp_seq_snap(rsp);
rnp_unlock = exp_funnel_lock(rsp, s);
*/
static void __init __rcu_init_preempt(void)
{
- rcu_init_one(rcu_state_p, rcu_data_p);
+ rcu_init_one(rcu_state_p); /* Initialize the rcu_state that rcu_state_p designates; no per-CPU data argument is passed. */
}
/*
READ_ONCE(rnp->boost_tasks) == NULL)
return 0; /* Nothing left to boost. */
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
/*
* Recheck under the lock: all tasks in need of boosting
"rcub/%d", rnp_index);
if (IS_ERR(t))
return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
sp.sched_priority = kthread_prio;
struct rcu_state *rsp;
int tne;
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
+ rcu_is_nocb_cpu(smp_processor_id()))
return;
/* Handle nohz enablement switches conservatively. */
if (!tne)
return;
- /* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(smp_processor_id()))
- return;
-
/*
* If a non-lazy callback arrived at a CPU having only lazy
* callbacks, invoke RCU core for the side-effect of recalculating
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- smp_mb__after_unlock_lock();
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
if (needwake)
bool needwake;
struct rcu_node *rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (needwake)
/*
- * Read-Copy Update tracing for classic implementation
+ * Read-Copy Update tracing for hierarchical implementation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
+ * Author: Paul E. McKenney
*
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
-#include <linux/module.h>
#include <linux/completion.h>
-#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
static int show_rcuexp(struct seq_file *m, void *v)
{
+ int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
-
+ struct rcu_data *rdp;
+ unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ s0 += atomic_long_read(&rdp->expedited_workdone0);
+ s1 += atomic_long_read(&rdp->expedited_workdone1);
+ s2 += atomic_long_read(&rdp->expedited_workdone2);
+ s3 += atomic_long_read(&rdp->expedited_workdone3);
+ }
seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence,
- atomic_long_read(&rsp->expedited_workdone0),
- atomic_long_read(&rsp->expedited_workdone1),
- atomic_long_read(&rsp->expedited_workdone2),
- atomic_long_read(&rsp->expedited_workdone3),
+ rsp->expedited_sequence, s0, s1, s2, s3,
atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
unsigned long gpmax;
struct rcu_node *rnp = &rsp->node[0];
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
completed = READ_ONCE(rsp->completed);
gpnum = READ_ONCE(rsp->gpnum);
if (completed == gpnum)
debugfs_remove_recursive(rcudir);
return 1;
}
-
-static void __exit rcutree_trace_cleanup(void)
-{
- debugfs_remove_recursive(rcudir);
-}
-
-
-module_init(rcutree_trace_init);
-module_exit(rcutree_trace_cleanup);
-
-MODULE_AUTHOR("Paul E. McKenney");
-MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
-MODULE_LICENSE("GPL");
+device_initcall(rcutree_trace_init);
#endif
#define MODULE_PARAM_PREFIX "rcupdate."
+#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
+module_param(rcu_normal, int, 0);
+static int rcu_normal_after_boot;
+module_param(rcu_normal_after_boot, int, 0);
+#endif /* #ifndef CONFIG_TINY_RCU */
#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
/**
#ifndef CONFIG_TINY_RCU
+/*
+ * Should expedited grace-period primitives always fall back to their
+ * non-expedited counterparts? Intended for use within RCU. Note
+ * that if the user specifies both rcu_expedited and rcu_normal, then
+ * rcu_normal wins.
+ */
+bool rcu_gp_is_normal(void) /* Returns true when rcu_normal is non-zero. */
+{
+ return READ_ONCE(rcu_normal); /* Marked read: rcu_normal is also stored with WRITE_ONCE() once boot completes. */
+}
+
static atomic_t rcu_expedited_nesting =
ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
-#endif /* #ifndef CONFIG_TINY_RCU */
-
/*
* Inform RCU of the end of the in-kernel boot sequence.
*/
{
if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
rcu_unexpedite_gp();
+ if (rcu_normal_after_boot)
+ WRITE_ONCE(rcu_normal, 1);
}
+#endif /* #ifndef CONFIG_TINY_RCU */
+
#ifdef CONFIG_PREEMPT_RCU
/*
raw_spin_unlock(&rq->lock);
}
+/*
+ * Notes on Program-Order guarantees on SMP systems.
+ *
+ * MIGRATION
+ *
+ * The basic program-order guarantee on SMP systems is that when a task [t]
+ * migrates, all its activity on its old cpu [c0] happens-before any subsequent
+ * execution on its new cpu [c1].
+ *
+ * For migration (of runnable tasks) this is provided by the following means:
+ *
+ * A) UNLOCK of the rq(c0)->lock scheduling out task t
+ * B) migration for t is required to synchronize *both* rq(c0)->lock and
+ * rq(c1)->lock (if not at the same time, then in that order).
+ * C) LOCK of the rq(c1)->lock scheduling in task
+ *
+ * Transitivity guarantees that B happens after A and C after B.
+ * Note: we only require RCpc transitivity.
+ * Note: the cpu doing B need not be c0 or c1
+ *
+ * Example:
+ *
+ * CPU0 CPU1 CPU2
+ *
+ * LOCK rq(0)->lock
+ * sched-out X
+ * sched-in Y
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(0)->lock // orders against CPU0
+ * dequeue X
+ * UNLOCK rq(0)->lock
+ *
+ * LOCK rq(1)->lock
+ * enqueue X
+ * UNLOCK rq(1)->lock
+ *
+ * LOCK rq(1)->lock // orders against CPU2
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(1)->lock
+ *
+ *
+ * BLOCKING -- aka. SLEEP + WAKEUP
+ *
+ * For blocking we (obviously) need to provide the same guarantee as for
+ * migration. However the means are completely different as there is no lock
+ * chain to provide order. Instead we do:
+ *
+ * 1) smp_store_release(X->on_cpu, 0)
+ * 2) smp_cond_acquire(!X->on_cpu)
+ *
+ * Example:
+ *
+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
+ *
+ * LOCK rq(0)->lock LOCK X->pi_lock
+ * dequeue X
+ * sched-out X
+ * smp_store_release(X->on_cpu, 0);
+ *
+ * smp_cond_acquire(!X->on_cpu);
+ * X->state = WAKING
+ * set_task_cpu(X,2)
+ *
+ * LOCK rq(2)->lock
+ * enqueue X
+ * X->state = RUNNING
+ * UNLOCK rq(2)->lock
+ *
+ * LOCK rq(2)->lock // orders against CPU1
+ * sched-out Z
+ * sched-in X
+ * UNLOCK rq(2)->lock
+ *
+ * UNLOCK X->pi_lock
+ * UNLOCK rq(0)->lock
+ *
+ *
+ * However, for wakeups there is a second guarantee we must provide, namely we
+ * must observe the state that led to our wakeup. That is, not only must our
+ * task observe its own prior state, it must also observe the stores prior to
+ * its wakeup.
+ *
+ * This means that any means of doing remote wakeups must order the CPU doing
+ * the wakeup against the CPU the task is going to end up running on. This,
+ * however, is already required for the regular Program-Order guarantee above,
+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
+ *
+ */
+
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
- */
- while (p->on_cpu)
- cpu_relax();
- /*
- * Combined with the control dependency above, we have an effective
- * smp_load_acquire() without the need for full barriers.
*
* Pairs with the smp_store_release() in finish_lock_switch().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_rmb();
+ smp_cond_acquire(!p->on_cpu);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
int decayed, removed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
removed = 1;
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the control dependency and rmb in try_to_wake_up().
+ * Pairs with the smp_cond_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
if (*pos < last_index + start_index)
return __start___tracepoint_str + (*pos - last_index);
+ start_index += last_index;
return find_next_mod_format(start_index, v, fmt, pos);
}
(unsigned long long)r); \
} while (0)
+/*
+ * Test for an atomic operation family: the plain operation plus its
+ * _acquire, _release and _relaxed variants.
+ * @test should be a macro accepting parameters (bit, op, ...)
+ *
+ * The helper macros below expand inside the test functions and use the
+ * locals v, v0, r and one from the expanding scope.
+ */
+
+#define FAMILY_TEST(test, bit, op, args...) \
+do { \
+ test(bit, op, ##args); /* plain op */ \
+ test(bit, op##_acquire, ##args); /* op_acquire */ \
+ test(bit, op##_release, ##args); /* op_release */ \
+ test(bit, op##_relaxed, ##args); /* op_relaxed */ \
+} while (0)
+
+#define TEST_RETURN(bit, op, c_op, val) /* op(val, &v) must return and store v0 c_op val. */ \
+do { \
+ atomic##bit##_set(&v, v0); /* start from v0 */ \
+ r = v0; \
+ r c_op val; /* expected result computed with the plain C operator */ \
+ BUG_ON(atomic##bit##_##op(val, &v) != r); /* return value */ \
+ BUG_ON(atomic##bit##_read(&v) != r); /* stored value */ \
+} while (0)
+
+#define RETURN_FAMILY_TEST(bit, op, c_op, val) /* TEST_RETURN over all four orderings. */ \
+do { \
+ FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \
+} while (0)
+
+#define TEST_ARGS(bit, op, init, ret, expect, args...) /* op(&v, args) must return ret and leave v == expect. */ \
+do { \
+ atomic##bit##_set(&v, init); /* start from init */ \
+ BUG_ON(atomic##bit##_##op(&v, ##args) != ret); /* return value */ \
+ BUG_ON(atomic##bit##_read(&v) != expect); /* stored value */ \
+} while (0)
+
+#define XCHG_FAMILY_TEST(bit, init, new) /* xchg returns the old value (init) and stores new. */ \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \
+} while (0)
+
+#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) /* One matching and one non-matching compare. */ \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, cmpxchg, /* old == init: returns init, stores new */ \
+ init, init, new, init, new); \
+ FAMILY_TEST(TEST_ARGS, bit, cmpxchg, /* old == wrong: returns init, value stays init */ \
+ init, init, init, wrong, new); \
+} while (0)
+
+#define INC_RETURN_FAMILY_TEST(bit, i) /* inc_return from i returns and stores i + one. */ \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, inc_return, \
+ i, (i) + one, (i) + one); \
+} while (0)
+
+#define DEC_RETURN_FAMILY_TEST(bit, i) /* dec_return from i returns and stores i - one. */ \
+do { \
+ FAMILY_TEST(TEST_ARGS, bit, dec_return, \
+ i, (i) - one, (i) - one); \
+} while (0)
+
static __init void test_atomic(void)
{
int v0 = 0xaaa31337;
TEST(, and, &=, v1);
TEST(, xor, ^=, v1);
TEST(, andnot, &= ~, v1);
+
+ RETURN_FAMILY_TEST(, add_return, +=, onestwos);
+ RETURN_FAMILY_TEST(, add_return, +=, -one);
+ RETURN_FAMILY_TEST(, sub_return, -=, onestwos);
+ RETURN_FAMILY_TEST(, sub_return, -=, -one);
+
+ INC_RETURN_FAMILY_TEST(, v0);
+ DEC_RETURN_FAMILY_TEST(, v0);
+
+ XCHG_FAMILY_TEST(, v0, v1);
+ CMPXCHG_FAMILY_TEST(, v0, v1, onestwos);
+
}
#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
TEST(64, xor, ^=, v1);
TEST(64, andnot, &= ~, v1);
- INIT(v0);
- r += onestwos;
- BUG_ON(atomic64_add_return(onestwos, &v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- r += -one;
- BUG_ON(atomic64_add_return(-one, &v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- r -= onestwos;
- BUG_ON(atomic64_sub_return(onestwos, &v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- r -= -one;
- BUG_ON(atomic64_sub_return(-one, &v) != r);
- BUG_ON(v.counter != r);
+ RETURN_FAMILY_TEST(64, add_return, +=, onestwos);
+ RETURN_FAMILY_TEST(64, add_return, +=, -one);
+ RETURN_FAMILY_TEST(64, sub_return, -=, onestwos);
+ RETURN_FAMILY_TEST(64, sub_return, -=, -one);
INIT(v0);
atomic64_inc(&v);
BUG_ON(v.counter != r);
INIT(v0);
- r += one;
- BUG_ON(atomic64_inc_return(&v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
atomic64_dec(&v);
r -= one;
BUG_ON(v.counter != r);
- INIT(v0);
- r -= one;
- BUG_ON(atomic64_dec_return(&v) != r);
- BUG_ON(v.counter != r);
-
- INIT(v0);
- BUG_ON(atomic64_xchg(&v, v1) != v0);
- r = v1;
- BUG_ON(v.counter != r);
-
- INIT(v0);
- BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
- r = v1;
- BUG_ON(v.counter != r);
+ INC_RETURN_FAMILY_TEST(64, v0);
+ DEC_RETURN_FAMILY_TEST(64, v0);
- INIT(v0);
- BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0);
- BUG_ON(v.counter != r);
+ XCHG_FAMILY_TEST(64, v0, v1);
+ CMPXCHG_FAMILY_TEST(64, v0, v1, v2);
INIT(v0);
BUG_ON(atomic64_add_unless(&v, one, v0));
entry->type = dma_debug_coherent;
entry->dev = dev;
entry->pfn = page_to_pfn(virt_to_page(virt));
- entry->offset = (size_t) virt & PAGE_MASK;
+ entry->offset = (size_t) virt & ~PAGE_MASK;
entry->size = size;
entry->dev_addr = dma_addr;
entry->direction = DMA_BIDIRECTIONAL;
.type = dma_debug_coherent,
.dev = dev,
.pfn = page_to_pfn(virt_to_page(virt)),
- .offset = (size_t) virt & PAGE_MASK,
+ .offset = (size_t) virt & ~PAGE_MASK,
.dev_addr = addr,
.size = size,
.direction = DMA_BIDIRECTIONAL,
next->prev = new;
new->next = next;
new->prev = prev;
- prev->next = new;
+ WRITE_ONCE(prev->next, new);
}
EXPORT_SYMBOL(__list_add);
return false;
}
-int rhashtable_insert_rehash(struct rhashtable *ht)
+int rhashtable_insert_rehash(struct rhashtable *ht,
+ struct bucket_table *tbl)
{
struct bucket_table *old_tbl;
struct bucket_table *new_tbl;
- struct bucket_table *tbl;
unsigned int size;
int err;
old_tbl = rht_dereference_rcu(ht->tbl, ht);
- tbl = rhashtable_last_table(ht, old_tbl);
size = tbl->size;
+ err = -EBUSY;
+
if (rht_grow_above_75(ht, tbl))
size *= 2;
/* Do not schedule more than one rehash */
else if (old_tbl != tbl)
- return -EBUSY;
+ goto fail;
+
+ err = -ENOMEM;
new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
- if (new_tbl == NULL) {
- /* Schedule async resize/rehash to try allocation
- * non-atomic context.
- */
- schedule_work(&ht->run_work);
- return -ENOMEM;
- }
+ if (new_tbl == NULL)
+ goto fail;
err = rhashtable_rehash_attach(ht, tbl, new_tbl);
if (err) {
schedule_work(&ht->run_work);
return err;
+
+fail:
+ /* Do not fail the insert if someone else did a rehash. */
+ if (likely(rcu_dereference_raw(tbl->future_tbl)))
+ return 0;
+
+ /* Schedule async rehash to retry allocation in process context. */
+ if (err == -ENOMEM)
+ schedule_work(&ht->run_work);
+
+ return err;
}
EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
-int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
- struct rhash_head *obj,
- struct bucket_table *tbl)
+struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
+ const void *key,
+ struct rhash_head *obj,
+ struct bucket_table *tbl)
{
struct rhash_head *head;
unsigned int hash;
exit:
spin_unlock(rht_bucket_lock(tbl, hash));
- return err;
+ if (err == 0)
+ return NULL;
+ else if (err == -EAGAIN)
+ return tbl;
+ else
+ return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
if (!iter->walker)
return -ENOMEM;
- mutex_lock(&ht->mutex);
- iter->walker->tbl = rht_dereference(ht->tbl, ht);
+ spin_lock(&ht->lock);
+ iter->walker->tbl =
+ rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
list_add(&iter->walker->list, &iter->walker->tbl->walkers);
- mutex_unlock(&ht->mutex);
+ spin_unlock(&ht->lock);
return 0;
}
*/
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
- mutex_lock(&iter->ht->mutex);
+ spin_lock(&iter->ht->lock); /* Same lock the init path holds while adding the walker to a table's list. */
if (iter->walker->tbl) /* Unlink only while the walker still references a table. */
list_del(&iter->walker->list);
- mutex_unlock(&iter->ht->mutex);
+ spin_unlock(&iter->ht->lock);
kfree(iter->walker); /* Free the walker itself; presumably allocated by the init path -- confirm. */
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
{
struct rhashtable *ht = iter->ht;
- mutex_lock(&ht->mutex);
+ rcu_read_lock();
+ spin_lock(&ht->lock);
if (iter->walker->tbl)
list_del(&iter->walker->list);
-
- rcu_read_lock();
-
- mutex_unlock(&ht->mutex);
+ spin_unlock(&ht->lock);
if (!iter->walker->tbl) {
iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);
if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
return -EINVAL;
- if (params->nelem_hint)
- size = rounded_hashtable_size(params);
-
memset(ht, 0, sizeof(*ht));
mutex_init(&ht->mutex);
spin_lock_init(&ht->lock);
ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
+ if (params->nelem_hint)
+ size = rounded_hashtable_size(&ht->p);
+
/* The maximum (not average) chain length grows with the
* size of the hash table, at a rate of (log N)/(log log N).
* The value of 16 is selected so that even if the hash
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
- do {
+ while (1) {
pos = READ_ONCE(iter->position);
+ if (!pos || css_tryget(&pos->css))
+ break;
/*
- * A racing update may change the position and
- * put the last reference, hence css_tryget(),
- * or retry to see the updated position.
+ * css reference reached zero, so iter->position will
+ * be cleared by ->css_released. However, we should not
+ * rely on this happening soon, because ->css_released
+ * is called from a work queue, and by busy-waiting we
+ * might block it. So we clear iter->position right
+ * away.
*/
- } while (pos && !css_tryget(&pos->css));
+ (void)cmpxchg(&iter->position, pos, NULL);
+ }
}
if (pos)
}
if (reclaim) {
- if (cmpxchg(&iter->position, pos, memcg) == pos) {
- if (memcg)
- css_get(&memcg->css);
- if (pos)
- css_put(&pos->css);
- }
-
/*
- * pairs with css_tryget when dereferencing iter->position
- * above.
+ * The position could have already been updated by a competing
+ * thread, so check that the value hasn't changed since we read
+ * it to avoid reclaiming from the same cgroup twice.
*/
+ (void)cmpxchg(&iter->position, pos, memcg);
+
if (pos)
css_put(&pos->css);
css_put(&prev->css);
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) /* Clear every ancestor iterator position that still points at dead_memcg. */
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup_reclaim_iter *iter;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
+ int i;
+
+ while ((memcg = parent_mem_cgroup(memcg))) { /* Ancestors only: the first step already moves to the parent. */
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ for (i = 0; i <= DEF_PRIORITY; i++) { /* Inclusive bound; iter[] is presumably indexed by reclaim priority -- confirm. */
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position, /* Reset to NULL only if it still equals dead_memcg. */
+ dead_memcg, NULL);
+ }
+ }
+ }
+ }
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
wb_memcg_offline(memcg);
}
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css) /* Installed as the .css_released callback. */
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* Map the css to its mem_cgroup. */
+
+ invalidate_reclaim_iterators(memcg); /* Drop ancestor iterator positions that still point at this memcg. */
+}
+
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
+ .css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
*/
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long pfn;
+ unsigned long pfn, sec_end_pfn;
struct zone *zone = NULL;
struct page *page;
int i;
- for (pfn = start_pfn;
+ /*
+ * Walk [start_pfn, end_pfn) one memory section at a time; sec_end_pfn
+ * is the exclusive end of the current section. Aligning start_pfn + 1
+ * rather than start_pfn keeps the first section non-empty when
+ * start_pfn is itself section aligned, and stepping to sec_end_pfn
+ * (not sec_end_pfn + 1) keeps every later scan window on the section
+ * boundary instead of one pfn past it.
+ *
+ * Returns 1 when every valid page found lies in a single zone, and 0
+ * as soon as two different zones are seen.
+ */
+ for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
pfn < end_pfn;
- pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
- i++;
- if (i == MAX_ORDER_NR_PAGES)
+ pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
+ /* Make sure the memory section is present first */
+ if (!present_section_nr(pfn_to_section_nr(pfn)))
continue;
- page = pfn_to_page(pfn + i);
- if (zone && page_zone(page) != zone)
- return 0;
- zone = page_zone(page);
+ for (; pfn < sec_end_pfn && pfn < end_pfn;
+ pfn += MAX_ORDER_NR_PAGES) {
+ i = 0;
+ /* This is just a CONFIG_HOLES_IN_ZONE check.*/
+ while ((i < MAX_ORDER_NR_PAGES) &&
+ !pfn_valid_within(pfn + i))
+ i++;
+ if (i == MAX_ORDER_NR_PAGES)
+ continue; /* whole window is a hole */
+ page = pfn_to_page(pfn + i); /* first valid page in the window */
+ if (zone && page_zone(page) != zone)
+ return 0;
+ zone = page_zone(page);
+ }
}
return 1;
}
int len;
struct inode *inode;
struct page *page;
- char *kaddr;
struct shmem_inode_info *info;
len = strlen(symname) + 1;
}
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- kaddr = kmap_atomic(page);
- memcpy(kaddr, symname, len);
- kunmap_atomic(kaddr);
+ inode_nohighmem(inode);
+ memcpy(page_address(page), symname, len);
SetPageUptodate(page);
set_page_dirty(page);
unlock_page(page);
return 0;
}
-static const char *shmem_follow_link(struct dentry *dentry, void **cookie)
+static void shmem_put_link(void *arg) /* Delayed-call cleanup; arg is the page registered by shmem_get_link(). */
{
- struct page *page = NULL;
- int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
- if (error)
- return ERR_PTR(error);
- unlock_page(page);
- *cookie = page;
- return kmap(page);
+ mark_page_accessed(arg);
+ put_page(arg); /* Drop the page reference taken in shmem_get_link(). */
}
-static void shmem_put_link(struct inode *unused, void *cookie)
+static const char *shmem_get_link(struct dentry *dentry, /* Returns the symlink body stored in page 0, or an ERR_PTR. */
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct page *page = cookie;
- kunmap(page);
- mark_page_accessed(page);
- page_cache_release(page);
+ struct page *page = NULL;
+ int error;
+ if (!dentry) { /* NOTE(review): a NULL dentry presumably means a non-blocking (RCU-walk) lookup -- confirm. */
+ page = find_get_page(inode->i_mapping, 0); /* Page-cache lookup only. */
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page); /* Not usable yet: drop our reference before bailing out. */
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+ if (error)
+ return ERR_PTR(error);
+ unlock_page(page); /* Keep the reference, release the page lock. */
+ }
+ set_delayed_call(done, shmem_put_link, page); /* Register shmem_put_link(page) as the cleanup for this link. */
+ return page_address(page); /* Direct address; no kmap is taken here. */
}
#ifdef CONFIG_TMPFS_XATTR
return 0;
}
-static const struct xattr_handler *shmem_xattr_handlers[] = {
-#ifdef CONFIG_TMPFS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL
-};
-
-static int shmem_xattr_validate(const char *name)
-{
- struct { const char *prefix; size_t len; } arr[] = {
- { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
- { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(arr); i++) {
- size_t preflen = arr[i].len;
- if (strncmp(name, arr[i].prefix, preflen) == 0) {
- if (!name[preflen])
- return -EINVAL;
- return 0;
- }
- }
- return -EOPNOTSUPP;
-}
-
-static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int shmem_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
-
- err = shmem_xattr_validate(name);
- if (err)
- return err;
+ name = xattr_full_name(handler, name);
return simple_xattr_get(&info->xattrs, name, buffer, size);
}
-static int shmem_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- err = shmem_xattr_validate(name);
- if (err)
- return err;
+ name = xattr_full_name(handler, name);
return simple_xattr_set(&info->xattrs, name, value, size, flags);
}
-static int shmem_removexattr(struct dentry *dentry, const char *name)
-{
- struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
+static const struct xattr_handler shmem_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
- err = shmem_xattr_validate(name);
- if (err)
- return err;
+static const struct xattr_handler shmem_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
- return simple_xattr_remove(&info->xattrs, name);
-}
+static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &shmem_security_xattr_handler,
+ &shmem_trusted_xattr_handler,
+ NULL
+};
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- return simple_xattr_list(&info->xattrs, buffer, size);
+ return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
};
static const struct inode_operations shmem_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = shmem_follow_link,
- .put_link = shmem_put_link,
+ .get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
};
.getattr = shmem_getattr,
.setattr = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
.set_acl = simple_set_acl,
#endif
};
.tmpfile = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
static const struct inode_operations shmem_special_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
* particular counter cannot be updated from interrupt context.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
+ long delta)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item;
* 1 Overstepping half of threshold
* -1 Overstepping minus half of threshold
*/
-static inline void mod_state(struct zone *zone,
- enum zone_stat_item item, int delta, int overstep_mode)
+static inline void mod_state(struct zone *zone, enum zone_stat_item item,
+ long delta, int overstep_mode)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item;
}
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
+ long delta)
{
mod_state(zone, item, delta, 0);
}
* Use interrupt disable to serialize counter updates
*/
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
+ long delta)
{
unsigned long flags;
BUG();
cpumask_copy(cpu_stat_off, cpu_online_mask);
+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
start_shepherd_timer();
cpu_notifier_register_done();
- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
return last;
}
+/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
struct zswap_pool *pool;
assert_spin_locked(&zswap_pools_lock);
list_for_each_entry_rcu(pool, &zswap_pools, list) {
- if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
+ if (strcmp(pool->tfm_name, compressor))
continue;
- if (strncmp(zpool_get_type(pool->zpool), type,
- sizeof(zswap_zpool_type)))
+ if (strcmp(zpool_get_type(pool->zpool), type))
continue;
/* if we can't get it, it's about to be destroyed */
if (!zswap_pool_get(pool))
struct sock *sk;
ax25_cb *ax25;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
int select;
batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key;
struct batadv_dat_candidate *res;
+ struct batadv_dat_entry dat;
if (!bat_priv->orig_hash)
return NULL;
if (!res)
return NULL;
- ip_key = (batadv_dat_addr_t)batadv_hash_dat(&ip_dst,
+ dat.ip = ip_dst;
+ dat.vid = 0;
+ ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat,
BATADV_DAT_ADDR_MAX);
batadv_dbg(BATADV_DBG_DAT, bat_priv,
u8 *orig_addr;
struct batadv_orig_node *orig_node = NULL;
int check, hdr_size = sizeof(*unicast_packet);
+ enum batadv_subtype subtype;
bool is4addr;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
/* packet for me */
if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
if (is4addr) {
- batadv_dat_inc_counter(bat_priv,
- unicast_4addr_packet->subtype);
- orig_addr = unicast_4addr_packet->src;
- orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ subtype = unicast_4addr_packet->subtype;
+ batadv_dat_inc_counter(bat_priv, subtype);
+
+ /* Only payload data should be considered for speedy
+ * join. For example, DAT also uses unicast 4addr
+ * types, but those packets should not be considered
+ * for speedy join, since the clients do not actually
+ * reside at the sending originator.
+ */
+ if (subtype == BATADV_P_DATA) {
+ orig_addr = unicast_4addr_packet->src;
+ orig_node = batadv_orig_hash_find(bat_priv,
+ orig_addr);
+ }
}
if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb,
unsigned short vid, const char *message,
bool roaming);
-/* returns 1 if they are the same mac addr */
+/* returns 1 if they are the same mac addr and vid */
static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_tt_common_entry,
hash_entry);
+ const struct batadv_tt_common_entry *tt1 = data1;
+ const struct batadv_tt_common_entry *tt2 = data2;
- return batadv_compare_eth(data1, data2);
+ return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2);
}
/**
}
/* if the client was temporary added before receiving the first
- * OGM announcing it, we have to clear the TEMP flag
+ * OGM announcing it, we have to clear the TEMP flag. Also,
+ * remove the previous temporary orig node and re-add it
+ * if required. If the orig entry changed, the new one which
+ * is a non-temporary entry is preferred.
*/
- common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ if (common->flags & BATADV_TT_CLIENT_TEMP) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ }
/* the change can carry possible "attribute" flags like the
* TT_CLIENT_WIFI, therefore they have to be copied in the
if (!addr || addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_sco))
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
struct switchdev_attr attr = {
.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
- .u.ageing_time = p->br->ageing_time,
+ .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
};
int err;
char *envp[] = { NULL };
struct net_bridge_port *p;
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ if (net_eq(dev_net(br->dev), &init_net))
+ r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ else
+ r = -ENOENT;
spin_lock_bh(&br->lock);
{
if (dst) {
int newrefcnt;
+ unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
- if (!newrefcnt && unlikely(dst->flags & DST_NOCACHE))
+ if (!newrefcnt && unlikely(nocache))
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
serr->ee.ee_info = tstype;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP)
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
serr->ee.ee_data -= sk->sk_tskey;
}
return NULL;
}
- memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len,
+ memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
2 * ETH_ALEN);
skb->mac_header += VLAN_HLEN;
return skb;
}
}
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
if (sk->sk_flags & flags) {
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
if (sk->sk_state != TCP_ESTABLISHED) {
ret = -EINVAL;
break;
*/
is_charged = sk_filter_charge(newsk, filter);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
{
struct sock *sk;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_changeupper_info *info;
struct in_device *in_dev;
struct net *net = dev_net(dev);
unsigned int flags;
case NETDEV_CHANGEMTU:
rt_cache_flush(net);
break;
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ fib_disable_ip(dev, NETDEV_DOWN, true);
+ break;
}
return NOTIFY_DONE;
}
u16 type;
struct udp_offload udp_offloads;
struct list_head list;
+ struct rcu_head rcu;
};
#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
list_del(&fou->list);
udp_tunnel_sock_release(sock);
- kfree(fou);
+ kfree_rcu(fou, rcu);
}
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
p.i_key = p.o_key = 0;
p.i_flags = p.o_flags = 0;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
config NFT_DUP_IPV4
tristate "IPv4 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV4
help
This module enables IPv4 packet duplication support for nf_tables.
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
- if (!saddr && ipc.oif)
- l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (err < 0)
+ goto done;
+ }
if (!inet->hdrincl) {
rfv.msg = msg;
int newly_acked_sacked = prior_unsacked -
(tp->packets_out - tp->sacked_out);
+ if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+ return;
+
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
- dst_hold(dst);
+ if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0, copied;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
struct sk_buff *syn_data;
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
- copied = copy_from_iter(skb_put(syn_data, space), space,
- &fo->data->msg_iter);
- if (unlikely(!copied)) {
- kfree_skb(syn_data);
- goto fallback;
- }
- if (copied != space) {
- skb_trim(syn_data, copied);
- space = copied;
+ if (space) {
+ int copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
}
-
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
fo->data = NULL;
flow_flags,
faddr, saddr, dport, inet->inet_sport);
- if (!saddr && ipc.oif)
- l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (err < 0)
+ goto out;
+ }
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm4_dst_ops = {
+static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.family = AF_INET,
- .dst_ops = &xfrm4_dst_ops,
+ .dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
.decode_session = _decode_session4,
{ }
};
-static int __net_init xfrm4_net_init(struct net *net)
+static int __net_init xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
return -ENOMEM;
}
-static void __net_exit xfrm4_net_exit(struct net *net)
+static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm4_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm4_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
+ sizeof(xfrm4_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm4_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ xfrm4_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+}
static struct pernet_operations __net_initdata xfrm4_net_ops = {
.init = xfrm4_net_init,
.exit = xfrm4_net_exit,
};
-#endif
static void __init xfrm4_policy_init(void)
{
void __init xfrm4_init(void)
{
- dst_entries_init(&xfrm4_dst_ops);
-
xfrm4_state_init();
xfrm4_policy_init();
xfrm4_protocol_init();
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm4_net_ops);
-#endif
}
setup_timer(&ndev->rs_timer, addrconf_rs_timer,
(unsigned long)ndev);
memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
+
+ if (ndev->cnf.stable_secret.initialized)
+ ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ else
+ ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+
ndev->cnf.mtu6 = dev->mtu;
ndev->cnf.sysctl = NULL;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if (in6_dev->cnf.optimistic_dad &&
!net->ipv6.devconf_all->forwarding && sllao)
- addr_flags = IFA_F_OPTIMISTIC;
+ addr_flags |= IFA_F_OPTIMISTIC;
#endif
/* Do not allow to create too much of autoconfigured
goto out;
}
- if (!write) {
- err = snprintf(str, sizeof(str), "%pI6",
- &secret->secret);
- if (err >= sizeof(str)) {
- err = -EIO;
- goto out;
- }
+ err = snprintf(str, sizeof(str), "%pI6", &secret->secret);
+ if (err >= sizeof(str)) {
+ err = -EIO;
+ goto out;
}
err = proc_dostring(&lctl, write, buffer, lenp, ppos);
rcu_read_lock();
p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
- if (p && ip6addrlbl_hold(p))
+ if (p && !ip6addrlbl_hold(p))
p = NULL;
lseq = ip6addrlbl_table.seq;
rcu_read_unlock();
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
return -EEXIST;
} else {
t = nt;
-
- ip6gre_tunnel_unlink(ign, t);
- ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
- ip6gre_tunnel_link(ign, t);
- netdev_state_change(dev);
}
+ ip6gre_tunnel_unlink(ign, t);
+ ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+ ip6gre_tunnel_link(ign, t);
return 0;
}
*/
if (!in6_dev->cnf.accept_ra_from_local &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- NULL, 0)) {
+ in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: default router ignored\n",
skb->dev->name);
#ifdef CONFIG_IPV6_ROUTE_INFO
if (!in6_dev->cnf.accept_ra_from_local &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- NULL, 0)) {
+ in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: router info ignored.\n",
skb->dev->name);
config NFT_DUP_IPV6
tristate "IPv6 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV6
help
This module enables IPv6 packet duplication support for nf_tables.
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
+ if (dst && dst_hold_safe(dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
- dst_hold(dst);
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm6_dst_ops = {
+static struct dst_ops xfrm6_dst_ops_template = {
.family = AF_INET6,
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
.family = AF_INET6,
- .dst_ops = &xfrm6_dst_ops,
+ .dst_ops = &xfrm6_dst_ops_template,
.dst_lookup = xfrm6_dst_lookup,
.get_saddr = xfrm6_get_saddr,
.decode_session = _decode_session6,
{ }
};
-static int __net_init xfrm6_net_init(struct net *net)
+static int __net_init xfrm6_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
return -ENOMEM;
}
-static void __net_exit xfrm6_net_exit(struct net *net)
+static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm6_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm6_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm6_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template,
+ sizeof(xfrm6_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm6_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm6_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm6_net_exit(struct net *net)
+{
+ xfrm6_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm6_dst_ops);
+}
static struct pernet_operations xfrm6_net_ops = {
.init = xfrm6_net_init,
.exit = xfrm6_net_exit,
};
-#endif
int __init xfrm6_init(void)
{
int ret;
- dst_entries_init(&xfrm6_dst_ops);
-
ret = xfrm6_policy_init();
- if (ret) {
- dst_entries_destroy(&xfrm6_dst_ops);
+ if (ret)
goto out;
- }
ret = xfrm6_state_init();
if (ret)
goto out_policy;
if (ret)
goto out_state;
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm6_net_ops);
-#endif
out:
return ret;
out_state:
void xfrm6_fini(void)
{
-#ifdef CONFIG_SYSCTL
unregister_pernet_subsys(&xfrm6_net_ops);
-#endif
xfrm6_protocol_fini();
xfrm6_policy_fini();
xfrm6_state_fini();
- dst_entries_destroy(&xfrm6_dst_ops);
}
struct sock *sk;
struct irda_sock *self;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (net != &init_net)
return -EAFNOSUPPORT;
* rc isn't initialized here yet, so ignore it
*/
__ieee80211_vht_handle_opmode(sdata, sta,
- params->opmode_notif,
- band, false);
+ params->opmode_notif, band);
}
if (ieee80211_vif_is_mesh(&sdata->vif))
void ieee80211_sta_set_rx_nss(struct sta_info *sta);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only);
+ enum ieee80211_band band);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only);
+ enum ieee80211_band band);
void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sta_vht_cap *vht_cap);
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
*/
if (has_80211h_pwr &&
(!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) {
+ new_ap_level = pwr_level_80211h;
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
sdata_dbg(sdata,
"Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n",
pwr_level_80211h, chan_pwr, pwr_reduction_80211h,
sdata->u.mgd.bssid);
- new_ap_level = pwr_level_80211h;
} else { /* has_cisco_pwr is always true here. */
+ new_ap_level = pwr_level_cisco;
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
sdata_dbg(sdata,
"Limiting TX power to %d dBm as advertised by %pM\n",
pwr_level_cisco, sdata->u.mgd.bssid);
- new_ap_level = pwr_level_cisco;
}
- if (sdata->ap_power_level == new_ap_level)
- return 0;
-
sdata->ap_power_level = new_ap_level;
if (__ieee80211_recalc_txpower(sdata))
return BSS_CHANGED_TXPOWER;
if (sta && elems.opmode_notif)
ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif,
- rx_status->band, true);
+ rx_status->band);
mutex_unlock(&local->sta_mtx);
changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt,
opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
- opmode, status->band,
- false);
+ opmode, status->band);
goto handled;
}
default:
drv_stop(local);
}
+static void ieee80211_flush_completed_scan(struct ieee80211_local *local,
+ bool aborted)
+{
+ /* It's possible that we don't handle the scan completion in
+ * time during suspend, so if it's still marked as completed
+ * here, queue the work and flush it to clean things up.
+ * Instead of calling the worker function directly here, we
+ * really queue it to avoid potential races with other flows
+ * scheduling the same work.
+ */
+ if (test_bit(SCAN_COMPLETED, &local->scanning)) {
+ /* If coming from reconfiguration failure, abort the scan so
+ * we don't attempt to continue a partial HW scan - which is
+ * possible otherwise if (e.g.) the 2.4 GHz portion was the
+ * completed scan, and a 5 GHz portion is still pending.
+ */
+ if (aborted)
+ set_bit(SCAN_ABORTED, &local->scanning);
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+ flush_delayed_work(&local->scan_work);
+ }
+}
+
static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
local->suspended = false;
local->in_reconfig = false;
+ ieee80211_flush_completed_scan(local, true);
+
/* scheduled scan clearly can't be running any more, but tell
* cfg80211 and clear local state
*/
mutex_unlock(&local->chanctx_mtx);
}
+static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ /* add STAs back */
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ enum ieee80211_sta_state state;
+
+ if (!sta->uploaded || sta->sdata != sdata)
+ continue;
+
+ for (state = IEEE80211_STA_NOTEXIST;
+ state < sta->sta_state; state++)
+ WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
+ state + 1));
+ }
+ mutex_unlock(&local->sta_mtx);
+}
+
int ieee80211_reconfig(struct ieee80211_local *local)
{
struct ieee80211_hw *hw = &local->hw;
WARN_ON(drv_add_chanctx(local, ctx));
mutex_unlock(&local->chanctx_mtx);
- list_for_each_entry(sdata, &local->interfaces, list) {
- if (!ieee80211_sdata_running(sdata))
- continue;
- ieee80211_assign_chanctx(local, sdata);
- }
-
sdata = rtnl_dereference(local->monitor_sdata);
if (sdata && ieee80211_sdata_running(sdata))
ieee80211_assign_chanctx(local, sdata);
}
- /* add STAs back */
- mutex_lock(&local->sta_mtx);
- list_for_each_entry(sta, &local->sta_list, list) {
- enum ieee80211_sta_state state;
-
- if (!sta->uploaded)
- continue;
-
- /* AP-mode stations will be added later */
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP)
- continue;
-
- for (state = IEEE80211_STA_NOTEXIST;
- state < sta->sta_state; state++)
- WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
- state + 1));
- }
- mutex_unlock(&local->sta_mtx);
-
- /* reconfigure tx conf */
- if (hw->queues >= IEEE80211_NUM_ACS) {
- list_for_each_entry(sdata, &local->interfaces, list) {
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- !ieee80211_sdata_running(sdata))
- continue;
-
- for (i = 0; i < IEEE80211_NUM_ACS; i++)
- drv_conf_tx(local, sdata, i,
- &sdata->tx_conf[i]);
- }
- }
-
/* reconfigure hardware */
ieee80211_hw_config(local, ~0);
if (!ieee80211_sdata_running(sdata))
continue;
+ ieee80211_assign_chanctx(local, sdata);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MONITOR:
+ break;
+ default:
+ ieee80211_reconfig_stations(sdata);
+ /* fall through */
+ case NL80211_IFTYPE_AP: /* AP stations are handled later */
+ for (i = 0; i < IEEE80211_NUM_ACS; i++)
+ drv_conf_tx(local, sdata, i,
+ &sdata->tx_conf[i]);
+ break;
+ }
+
/* common change flags for all interface types */
changed = BSS_CHANGED_ERP_CTS_PROT |
BSS_CHANGED_ERP_PREAMBLE |
mb();
local->resuming = false;
- /* It's possible that we don't handle the scan completion in
- * time during suspend, so if it's still marked as completed
- * here, queue the work and flush it to clean things up.
- * Instead of calling the worker function directly here, we
- * really queue it to avoid potential races with other flows
- * scheduling the same work.
- */
- if (test_bit(SCAN_COMPLETED, &local->scanning)) {
- ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
- flush_delayed_work(&local->scan_work);
- }
+ ieee80211_flush_completed_scan(local, false);
if (local->open_count && !reconfig_due_to_wowlan)
drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only)
+ enum ieee80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
changed |= IEEE80211_RC_NSS_CHANGED;
}
- if (nss_only)
- return changed;
-
switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ:
sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only)
+ enum ieee80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
- u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode,
- band, nss_only);
+ u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
if (changed > 0)
rate_control_rate_update(local, sband, sta, changed);
*/
#define MAX_MP_SELECT_LABELS 4
+#define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
+
static int zero = 0;
static int label_limit = (1 << 20) - 1;
}
}
- err = neigh_xmit(nh->nh_via_table, out_dev, mpls_nh_via(rt, nh), skb);
+ /* If via wasn't specified then send out using device address */
+ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
+ err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
+ out_dev->dev_addr, skb);
+ else
+ err = neigh_xmit(nh->nh_via_table, out_dev,
+ mpls_nh_via(rt, nh), skb);
if (err)
net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err);
if (!mpls_dev_get(dev))
goto errout;
+ if ((nh->nh_via_table == NEIGH_LINK_TABLE) &&
+ (dev->addr_len != nh->nh_via_alen))
+ goto errout;
+
RCU_INIT_POINTER(nh->nh_dev, dev);
return 0;
goto errout;
}
- err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table,
- __mpls_nh_via(rt, nh));
- if (err)
- goto errout;
+ if (via) {
+ err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table,
+ __mpls_nh_via(rt, nh));
+ if (err)
+ goto errout;
+ } else {
+ nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC;
+ }
err = mpls_nh_assign_dev(net, rt, nh, oif);
if (err)
nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
}
- if (!nla_via)
- goto errout;
-
err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
rtnh->rtnh_ifindex, nla_via,
nla_newdst);
cfg->rc_label = LABEL_NOT_SPECIFIED;
cfg->rc_protocol = rtm->rtm_protocol;
+ cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC;
cfg->rc_nlflags = nlh->nlmsg_flags;
cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->rc_nlinfo.nlh = nlh;
nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
nh->nh_label))
goto nla_put_failure;
- if (nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh),
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC &&
+ nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh),
nh->nh_via_alen))
goto nla_put_failure;
dev = rtnl_dereference(nh->nh_dev);
nh->nh_labels,
nh->nh_label))
goto nla_put_failure;
- if (nla_put_via(skb, nh->nh_via_table,
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC &&
+ nla_put_via(skb, nh->nh_via_table,
mpls_nh_via(rt, nh),
nh->nh_via_alen))
goto nla_put_failure;
if (nh->nh_dev)
payload += nla_total_size(4); /* RTA_OIF */
- payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */
+ payload += nla_total_size(2 + nh->nh_via_alen);
if (nh->nh_labels) /* RTA_NEWDST */
payload += nla_total_size(nh->nh_labels * 4);
} else {
for_nexthops(rt) {
nhsize += nla_total_size(sizeof(struct rtnexthop));
- nhsize += nla_total_size(2 + nh->nh_via_alen);
+ /* RTA_VIA */
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC)
+ nhsize += nla_total_size(2 + nh->nh_via_alen);
if (nh->nh_labels)
nhsize += nla_total_size(nh->nh_labels * 4);
} endfor_nexthops(rt);
unsigned int ttl;
/* Obtain the ttl */
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (dst->ops->family == AF_INET) {
ttl = ip_hdr(skb)->ttl;
rt = (struct rtable *)dst;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (dst->ops->family == AF_INET6) {
ttl = ipv6_hdr(skb)->hop_limit;
rt6 = (struct rt6_info *)dst;
} else {
}
static void nft_ctx_init(struct nft_ctx *ctx,
+ struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct nft_af_info *afi,
struct nft_chain *chain,
const struct nlattr * const *nla)
{
- ctx->net = sock_net(skb->sk);
+ ctx->net = net;
ctx->afi = afi;
ctx->table = table;
ctx->chain = chain;
return ret;
}
-static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newtable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nlattr *name;
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
u32 flags = 0;
struct nft_ctx ctx;
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
return nf_tables_updtable(&ctx);
}
INIT_LIST_HEAD(&table->sets);
table->flags = flags;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err3;
return err;
}
-static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_deltable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla);
if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
return nft_flush(&ctx, family);
}
}
-static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_chain *chain;
struct nft_base_chain *basechain = NULL;
struct nlattr *ha[NFTA_HOOK_MAX + 1];
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct net_device *dev = NULL;
u8 policy = NF_ACCEPT;
return PTR_ERR(stats);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
sizeof(struct nft_trans_chain));
if (trans == NULL) {
if (err < 0)
goto err1;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
if (err < 0)
goto err2;
return err;
}
-static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
if (chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
return nft_delchain(&ctx);
}
static struct nft_expr_info *info;
-static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule, *old_rule = NULL;
return PTR_ERR(old_rule);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
n = 0;
size = 0;
return err;
}
-static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain = NULL;
struct nft_rule *rule;
return PTR_ERR(chain);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi = NULL;
struct nft_table *table = NULL;
return -ENOENT;
}
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_ctx ctx;
struct sk_buff *skb2;
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
return 0;
}
-static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nft_set_ops *ops;
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
if (IS_ERR(table))
return PTR_ERR(table);
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
if (IS_ERR(set)) {
nft_set_destroy(set);
}
-static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
if (IS_ERR(afi))
if (!trans && (table->flags & NFT_TABLE_INACTIVE))
return -ENOENT;
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
if (err < 0)
return err;
- err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
- false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
+ (void *)nla, false);
if (err < 0)
return err;
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_ctx ctx;
int err;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false);
if (err < 0)
return err;
return err;
}
-static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true);
if (err < 0)
return err;
return err;
}
-static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nlattr *attr;
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false);
if (err < 0)
return err;
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ list) {
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
{
struct nft_pktinfo pkt;
- switch (eth_hdr(skb)->h_proto) {
+ switch (skb->protocol) {
case htons(ETH_P_IP):
nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
break;
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM);
- skb->sk = oskb->sk;
-
nfnl_lock(subsys_id);
ss = rcu_dereference_protected(table[subsys_id].subsys,
lockdep_is_held(&table[subsys_id].mutex));
goto ack;
if (nc->call_batch) {
- err = nc->call_batch(net->nfnl, skb, nlh,
+ err = nc->call_batch(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
}
break;
}
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
if (queue->flags & NFQA_CFG_F_CONNTRACK) {
- nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL) {
ct = nfnl_ct->get_ct(entskb, &ctinfo);
if (ct != NULL)
if (entry == NULL)
return -ENOENT;
+ /* rcu lock already held from nfnl->call_rcu. */
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
if (nfqa[NFQA_CT]) {
- /* rcu lock already held from nfnl->call_rcu. */
- nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL)
ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
}
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
out:
return status;
}
goto nla_put_failure;
switch (priv->key) {
+ case NFT_CT_L3PROTOCOL:
case NFT_CT_PROTOCOL:
case NFT_CT_SRC:
case NFT_CT_DST:
struct md_labels labels;
};
+static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
+
static u16 key_to_nfproto(const struct sw_flow_key *key)
{
switch (ntohs(key->eth.type)) {
* previously sent the packet to conntrack via the ct action.
*/
static void ovs_ct_update_key(const struct sk_buff *skb,
+ const struct ovs_conntrack_info *info,
struct sw_flow_key *key, bool post_ct)
{
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
zone = nf_ct_zone(ct);
} else if (post_ct) {
state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
+ if (info)
+ zone = &info->zone;
}
__ovs_ct_update_key(key, state, zone, ct);
}
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
{
- ovs_ct_update_key(skb, key, false);
+ ovs_ct_update_key(skb, NULL, key, false);
}
int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
}
}
- ovs_ct_update_key(skb, key, true);
+ ovs_ct_update_key(skb, info, key, true);
return 0;
}
OVS_NLERR(log, "Failed to allocate conntrack template");
return -ENOMEM;
}
+
+ __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
+ nf_conntrack_get(&ct_info.ct->ct_general);
+
if (helper) {
err = ovs_ct_add_helper(&ct_info, helper, key, log);
if (err)
if (err)
goto err_free_ct;
- __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
- nf_conntrack_get(&ct_info.ct->ct_general);
return 0;
err_free_ct:
- nf_conntrack_free(ct_info.ct);
+ __ovs_ct_free_action(&ct_info);
return err;
}
{
struct ovs_conntrack_info *ct_info = nla_data(a);
+ __ovs_ct_free_action(ct_info);
+}
+
+static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
+{
if (ct_info->helper)
module_put(ct_info->helper->me);
if (ct_info->ct)
if (!start)
return -EMSGSIZE;
- err = ovs_nla_put_tunnel_info(skb, tun_info);
+ err = ip_tun_to_nlattr(skb, &tun_info->key,
+ ip_tunnel_info_opts(tun_info),
+ tun_info->options_len,
+ ip_tunnel_info_af(tun_info));
if (err)
return err;
nla_nest_end(skb, start);
struct rfkill {
spinlock_t lock;
- const char *name;
enum rfkill_type type;
unsigned long state;
struct delayed_work poll_work;
struct work_struct uevent_work;
struct work_struct sync_work;
+ char name[];
};
#define to_rfkill(d) container_of(d, struct rfkill, dev)
if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES))
return NULL;
- rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL);
+ rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL);
if (!rfkill)
return NULL;
spin_lock_init(&rfkill->lock);
INIT_LIST_HEAD(&rfkill->node);
rfkill->type = type;
- rfkill->name = name;
+ strcpy(rfkill->name, name);
rfkill->ops = ops;
rfkill->data = ops_data;
}
lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
if (!netif_is_multiqueue(dev))
- sch->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
+ sch->flags |= TCQ_F_ONETXQUEUE;
}
sch->handle = handle;
{
struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
- if (qdisc_is_percpu_stats(qdisc))
+ if (qdisc_is_percpu_stats(qdisc)) {
free_percpu(qdisc->cpu_bstats);
+ free_percpu(qdisc->cpu_qstats);
+ }
kfree((char *) qdisc - qdisc->padded);
}
}
}
}
- rcu_read_unlock();
-
if (baddr) {
fl6->saddr = baddr->v6.sin6_addr;
fl6->fl6_sport = baddr->v6.sin6_port;
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
}
+ rcu_read_unlock();
out:
if (!IS_ERR_OR_NULL(dst)) {
struct sock *newsk;
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct sctp6_sock *newsctp6sk;
+ struct ipv6_txoptions *opt;
newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
if (!newsk)
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+ rcu_read_lock();
+ opt = rcu_dereference(np->opt);
+ if (opt)
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ rcu_read_unlock();
+
/* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
* and getpeername().
*/
sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
"illegal chunk");
+ sctp_chunk_hold(chunk);
sctp_outq_tail_data(q, chunk);
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
*/
sack_a_rwnd = ntohl(sack->a_rwnd);
+ asoc->peer.zero_window_announced = !sack_a_rwnd;
outstanding = q->outstanding_bytes;
if (outstanding < sack_a_rwnd)
/* Set an expiration time for the cookie. */
cookie->c.expiration = ktime_add(asoc->cookie_life,
- ktime_get());
+ ktime_get_real());
/* Copy the peer's init packet. */
memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
kt = skb_get_ktime(skb);
else
- kt = ktime_get();
+ kt = ktime_get_real();
if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
/*
retval = SCTP_DISPOSITION_CONSUME;
- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
/* Even if we can't send the ABORT due to low memory delete the
* TCB. This is a departure from our typical NOMEM handling.
SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
retval = SCTP_DISPOSITION_CONSUME;
- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
SCTP_STATE(SCTP_STATE_CLOSED));
SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS);
if (asoc->overall_error_count >= asoc->max_retrans) {
- if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
+ if (asoc->peer.zero_window_announced &&
+ asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
/*
* We are here likely because the receiver had its rwnd
* closed for a while and we have not been able to
int addrs_size,
sctp_assoc_t *assoc_id)
{
- int err = 0;
struct sockaddr *kaddrs;
+ gfp_t gfp = GFP_KERNEL;
+ int err = 0;
pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
__func__, sk, addrs, addrs_size);
return -EFAULT;
/* Alloc space for the address array in kernel memory. */
- kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ if (sk->sk_socket->file)
+ gfp = GFP_USER | __GFP_NOWARN;
+ kaddrs = kmalloc(addrs_size, gfp);
if (unlikely(!kaddrs))
return -ENOMEM;
struct sctp_chunk *chunk;
chunk = sctp_make_abort_user(asoc, NULL, 0);
- if (chunk)
- sctp_primitive_ABORT(net, asoc, chunk);
+ sctp_primitive_ABORT(net, asoc, chunk);
} else
sctp_primitive_SHUTDOWN(net, asoc, NULL);
}
/* Now send the (possibly) fragmented message. */
list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
- sctp_chunk_hold(chunk);
-
/* Do accounting for the write space. */
sctp_set_owner_w(chunk);
* breaks.
*/
err = sctp_primitive_SEND(net, asoc, datamsg);
+ sctp_datamsg_put(datamsg);
/* Did the lower layer accept the chunk? */
- if (err) {
- sctp_datamsg_free(datamsg);
+ if (err)
goto out_free;
- }
pr_debug("%s: we sent primitively\n", __func__);
- sctp_datamsg_put(datamsg);
err = msg_len;
if (unlikely(wait_connect)) {
len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num;
- ids = kmalloc(len, GFP_KERNEL);
+ ids = kmalloc(len, GFP_USER | __GFP_NOWARN);
if (unlikely(!ids))
return -ENOMEM;
newsk->sk_type = sk->sk_type;
newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
newsk->sk_flags = sk->sk_flags;
+ newsk->sk_tsflags = sk->sk_tsflags;
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
newinet->mc_ttl = 1;
newinet->mc_index = 0;
newinet->mc_list = NULL;
+
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
+
+ security_sk_clone(sk, newsk);
}
static inline void sctp_copy_descendant(struct sock *sk_to,
}
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
+ wq->flags = 0;
RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
/* We assume all kernel code knows the size of sockaddr_storage */
msg.msg_namelen = 0;
+ msg.msg_iocb = NULL;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
return NULL;
}
-static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode,
+ struct path *res)
{
- struct dentry *dentry;
- struct path path;
- int err = 0;
- /*
- * Get the parent directory, calculate the hash for last
- * component.
- */
- dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- return err;
+ int err;
- /*
- * All right, let's create it.
- */
- err = security_path_mknod(&path, dentry, mode, 0);
+ err = security_path_mknod(path, dentry, mode, 0);
if (!err) {
- err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+ err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
if (!err) {
- res->mnt = mntget(path.mnt);
+ res->mnt = mntget(path->mnt);
res->dentry = dget(dentry);
}
}
- done_path_create(&path, dentry);
+
return err;
}
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
- int err;
+ int err, name_err;
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
+ struct path path;
+ struct dentry *dentry;
err = -EINVAL;
if (sunaddr->sun_family != AF_UNIX)
goto out;
addr_len = err;
+ name_err = 0;
+ dentry = NULL;
+ if (sun_path[0]) {
+ /* Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+
+ if (IS_ERR(dentry)) {
+ /* delay report until after 'already bound' check */
+ name_err = PTR_ERR(dentry);
+ dentry = NULL;
+ }
+ }
+
err = mutex_lock_interruptible(&u->readlock);
if (err)
- goto out;
+ goto out_path;
err = -EINVAL;
if (u->addr)
goto out_up;
+ if (name_err) {
+ err = name_err == -EEXIST ? -EADDRINUSE : name_err;
+ goto out_up;
+ }
+
err = -ENOMEM;
addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
if (!addr)
addr->hash = hash ^ sk->sk_type;
atomic_set(&addr->refcnt, 1);
- if (sun_path[0]) {
- struct path path;
+ if (dentry) {
+ struct path u_path;
umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
- err = unix_mknod(sun_path, mode, &path);
+ err = unix_mknod(dentry, &path, mode, &u_path);
if (err) {
if (err == -EEXIST)
err = -EADDRINUSE;
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
+ hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
- u->path = path;
+ u->path = u_path;
list = &unix_socket_table[hash];
} else {
spin_lock(&unix_table_lock);
spin_unlock(&unix_table_lock);
out_up:
mutex_unlock(&u->readlock);
+out_path:
+ if (dentry)
+ done_path_create(&path, dentry);
+
out:
return err;
}
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
- err = mutex_lock_interruptible(&u->readlock);
- if (unlikely(err)) {
- /* recvmsg() in non blocking mode is supposed to return -EAGAIN
- * sk_rcvtimeo is not honored by mutex_lock_interruptible()
- */
- err = noblock ? -EAGAIN : -ERESTARTSYS;
- goto out;
- }
+ mutex_lock(&u->readlock);
if (flags & MSG_PEEK)
skip = sk_peek_offset(sk, flags);
timeo = unix_stream_data_wait(sk, timeo, last,
last_len);
- if (signal_pending(current) ||
- mutex_lock_interruptible(&u->readlock)) {
+ if (signal_pending(current)) {
err = sock_intr_errno(timeo);
goto out;
}
+ mutex_lock(&u->readlock);
continue;
unlock:
unix_state_unlock(sk);
if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
if (!(rdev->wiphy.features &
NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
- !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+ !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) {
+ kzfree(connkeys);
return -EINVAL;
+ }
connect.flags |= ASSOC_REQ_USE_RRM;
}
if (new_triggers.tcp && new_triggers.tcp->sock)
sock_release(new_triggers.tcp->sock);
kfree(new_triggers.tcp);
+ kfree(new_triggers.nd_config);
return err;
}
#endif
break;
default:
WARN(1, "invalid initiator %d\n", lr->initiator);
+ kfree(rd);
return -EINVAL;
}
/* We always try to get an update for the static regdomain */
err = regulatory_hint_core(cfg80211_world_regdom->alpha2);
if (err) {
- if (err == -ENOMEM)
+ if (err == -ENOMEM) {
+ platform_device_unregister(reg_pdev);
return err;
+ }
/*
* N.B. kobject_uevent_env() can fail mainly for when we're out
* memory which is handled and propagated appropriately above
}
EXPORT_SYMBOL(xfrm_policy_alloc);
+static void xfrm_policy_destroy_rcu(struct rcu_head *head)
+{
+ struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu);
+
+ security_xfrm_policy_free(policy->security);
+ kfree(policy);
+}
+
/* Destroy xfrm_policy: descendant resources must be released to this moment. */
void xfrm_policy_destroy(struct xfrm_policy *policy)
if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
BUG();
- security_xfrm_policy_free(policy->security);
- kfree(policy);
+ call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);
struct xfrm_policy *pol;
struct net *net = sock_net(sk);
+ rcu_read_lock();
read_lock_bh(&net->xfrm.xfrm_policy_lock);
- if ((pol = sk->sk_policy[dir]) != NULL) {
+ pol = rcu_dereference(sk->sk_policy[dir]);
+ if (pol != NULL) {
bool match = xfrm_selector_match(&pol->selector, fl,
sk->sk_family);
int err = 0;
}
out:
read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ rcu_read_unlock();
return pol;
}
#endif
write_lock_bh(&net->xfrm.xfrm_policy_lock);
- old_pol = sk->sk_policy[dir];
- sk->sk_policy[dir] = pol;
+ old_pol = rcu_dereference_protected(sk->sk_policy[dir],
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
pol->curlft.add_time = get_seconds();
pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
xfrm_sk_policy_link(pol, dir);
}
+ rcu_assign_pointer(sk->sk_policy[dir], pol);
if (old_pol) {
if (pol)
xfrm_policy_requeue(old_pol, pol);
return newp;
}
-int __xfrm_sk_clone_policy(struct sock *sk)
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
- struct xfrm_policy *p0 = sk->sk_policy[0],
- *p1 = sk->sk_policy[1];
+ const struct xfrm_policy *p;
+ struct xfrm_policy *np;
+ int i, ret = 0;
- sk->sk_policy[0] = sk->sk_policy[1] = NULL;
- if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
- return -ENOMEM;
- if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
- return -ENOMEM;
- return 0;
+ rcu_read_lock();
+ for (i = 0; i < 2; i++) {
+ p = rcu_dereference(osk->sk_policy[i]);
+ if (p) {
+ np = clone_policy(p, i);
+ if (unlikely(!np)) {
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(sk->sk_policy[i], np);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
}
static int
xdst = NULL;
route = NULL;
+ sk = sk_const_to_full_sk(sk);
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
}
pol = NULL;
+ sk = sk_to_full_sk(sk);
if (sk && sk->sk_policy[dir]) {
pol = xfrm_sk_policy_lookup(sk, dir, &fl);
if (IS_ERR(pol)) {
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
- struct net *net;
int err = 0;
if (unlikely(afinfo == NULL))
return -EINVAL;
}
spin_unlock(&xfrm_policy_afinfo_lock);
- rtnl_lock();
- for_each_net(net) {
- struct dst_ops *xfrm_dst_ops;
-
- switch (afinfo->family) {
- case AF_INET:
- xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops;
- break;
-#if IS_ENABLED(CONFIG_IPV6)
- case AF_INET6:
- xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops;
- break;
-#endif
- default:
- BUG();
- }
- *xfrm_dst_ops = *afinfo->dst_ops;
- }
- rtnl_unlock();
-
return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
-static void __net_init xfrm_dst_ops_init(struct net *net)
-{
- struct xfrm_policy_afinfo *afinfo;
-
- rcu_read_lock();
- afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]);
- if (afinfo)
- net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops;
-#if IS_ENABLED(CONFIG_IPV6)
- afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]);
- if (afinfo)
- net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops;
-#endif
- rcu_read_unlock();
-}
-
static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
rv = xfrm_policy_init(net);
if (rv < 0)
goto out_policy;
- xfrm_dst_ops_init(net);
rv = xfrm_sysctl_init(net);
if (rv < 0)
goto out_sysctl;
static int fd_map; /* File descriptor for file being modified. */
static int mmap_failed; /* Boolean flag. */
-static void *ehdr_curr; /* current ElfXX_Ehdr * for resource cleanup */
static char gpfx; /* prefix for global symbol name (sometimes '_') */
static struct stat sb; /* Remember .st_size, etc. */
static jmp_buf jmpenv; /* setjmp/longjmp per-file error escape */
static const char *altmcount; /* alternate mcount symbol name */
static int warn_on_notrace_sect; /* warn when section has mcount not being recorded */
+static void *file_map; /* pointer of the mapped file */
+static void *file_end; /* pointer to the end of the mapped file */
+static int file_updated; /* flag to state file was changed */
+static void *file_ptr; /* current file pointer location */
+static void *file_append; /* added to the end of the file */
+static size_t file_append_size; /* how much is added to end of file */
/* setjmp() return values */
enum {
cleanup(void)
{
if (!mmap_failed)
- munmap(ehdr_curr, sb.st_size);
+ munmap(file_map, sb.st_size);
else
- free(ehdr_curr);
- close(fd_map);
+ free(file_map);
+ file_map = NULL;
+ free(file_append);
+ file_append = NULL;
+ file_append_size = 0;
+ file_updated = 0;
}
static void __attribute__((noreturn))
static off_t
ulseek(int const fd, off_t const offset, int const whence)
{
- off_t const w = lseek(fd, offset, whence);
- if (w == (off_t)-1) {
- perror("lseek");
+ switch (whence) {
+ case SEEK_SET:
+ file_ptr = file_map + offset;
+ break;
+ case SEEK_CUR:
+ file_ptr += offset;
+ break;
+ case SEEK_END:
+ file_ptr = file_map + (sb.st_size - offset);
+ break;
+ }
+ if (file_ptr < file_map) {
+ fprintf(stderr, "lseek: seek before file\n");
fail_file();
}
- return w;
+ return file_ptr - file_map;
}
static size_t
static size_t
uwrite(int const fd, void const *const buf, size_t const count)
{
- size_t const n = write(fd, buf, count);
- if (n != count) {
- perror("write");
- fail_file();
+ size_t cnt = count;
+ off_t idx = 0;
+
+ file_updated = 1;
+
+ if (file_ptr + count >= file_end) {
+ off_t aoffset = (file_ptr + count) - file_end;
+
+ if (aoffset > file_append_size) {
+ file_append = realloc(file_append, aoffset);
+ file_append_size = aoffset;
+ }
+ if (!file_append) {
+ perror("write");
+ fail_file();
+ }
+ if (file_ptr < file_end) {
+ cnt = file_end - file_ptr;
+ } else {
+ cnt = 0;
+ idx = aoffset - count;
+ }
}
- return n;
+
+ if (cnt)
+ memcpy(file_ptr, buf, cnt);
+
+ if (cnt < count)
+ memcpy(file_append + idx, buf + cnt, count - cnt);
+
+ file_ptr += count;
+ return count;
}
static void *
*/
static void *mmap_file(char const *fname)
{
- void *addr;
-
- fd_map = open(fname, O_RDWR);
+ fd_map = open(fname, O_RDONLY);
if (fd_map < 0 || fstat(fd_map, &sb) < 0) {
perror(fname);
fail_file();
fprintf(stderr, "not a regular file: %s\n", fname);
fail_file();
}
- addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
- fd_map, 0);
+ file_map = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE,
+ fd_map, 0);
mmap_failed = 0;
- if (addr == MAP_FAILED) {
+ if (file_map == MAP_FAILED) {
mmap_failed = 1;
- addr = umalloc(sb.st_size);
- uread(fd_map, addr, sb.st_size);
+ file_map = umalloc(sb.st_size);
+ uread(fd_map, file_map, sb.st_size);
+ }
+ close(fd_map);
+
+ file_end = file_map + sb.st_size;
+
+ return file_map;
+}
+
+/*
+ * write_file - write the in-memory image of @fname back to disk.
+ * @fname: path of the object file that was loaded by mmap_file()
+ *
+ * Does nothing unless file_updated is set.  Otherwise the mapped image
+ * (sb.st_size bytes at file_map), followed by any appended data
+ * (file_append_size bytes at file_append), is written to "<fname>.rc",
+ * which is then renamed over @fname.
+ *
+ * Any failure is reported on stderr and handed to fail_file(), which is
+ * expected not to return.
+ */
+static void write_file(const char *fname)
+{
+	char tmp_file[strlen(fname) + 4];	/* ".rc" plus the NUL */
+	size_t n;
+
+	if (!file_updated)
+		return;
+
+	sprintf(tmp_file, "%s.rc", fname);
+
+	/*
+	 * Write the whole image to a temporary file and rename it over
+	 * the original, to prevent weird side effects of modifying
+	 * an object file in place.
+	 */
+	fd_map = open(tmp_file, O_WRONLY | O_TRUNC | O_CREAT, sb.st_mode);
+	if (fd_map < 0) {
+		/* It is the temporary file that could not be opened. */
+		perror(tmp_file);
+		fail_file();
+	}
+	n = write(fd_map, file_map, sb.st_size);
+	if (n != sb.st_size) {
+		perror("write");
+		/* Close after perror() so errno is reported intact. */
+		close(fd_map);
+		fail_file();
+	}
+	if (file_append_size) {
+		n = write(fd_map, file_append, file_append_size);
+		if (n != file_append_size) {
+			perror("write");
+			close(fd_map);
+			fail_file();
+		}
+	}
+	close(fd_map);
+	if (rename(tmp_file, fname) < 0) {
+		perror(fname);
+		fail_file();
 	}
-	return addr;
 }
/* w8rev, w8nat, ...: Handle endianness. */
Elf32_Ehdr *const ehdr = mmap_file(fname);
unsigned int reltype = 0;
- ehdr_curr = ehdr;
w = w4nat;
w2 = w2nat;
w8 = w8nat;
}
} /* end switch */
+ write_file(fname);
cleanup();
}
case SJ_SETJMP: /* normal sequence */
/* Avoid problems if early cleanup() */
fd_map = -1;
- ehdr_curr = NULL;
mmap_failed = 1;
+ file_map = NULL;
+ file_ptr = NULL;
+ file_updated = 0;
do_file(file);
break;
case SJ_FAIL: /* error in do_file or below */
+ fprintf(stderr, "%s: failed\n", file);
++n_error;
break;
case SJ_SUCCEED: /* premature success */
/* the key is probably readable - now try to read it */
can_read_key:
- ret = key_validate(key);
- if (ret == 0) {
- ret = -EOPNOTSUPP;
- if (key->type->read) {
- /* read the data with the semaphore held (since we
- * might sleep) */
- down_read(&key->sem);
+ ret = -EOPNOTSUPP;
+ if (key->type->read) {
+ /* Read the data with the semaphore held (since we might sleep)
+ * to protect against the key being updated or revoked.
+ */
+ down_read(&key->sem);
+ ret = key_validate(key);
+ if (ret == 0)
ret = key->type->read(key, buffer, buflen);
- up_read(&key->sem);
- }
+ up_read(&key->sem);
}
error2:
* @inode: the object
* @buffer: where they go
* @buffer_size: size of buffer
- *
- * Returns 0 on success, -EINVAL otherwise
*/
static int smack_inode_listsecurity(struct inode *inode, char *buffer,
size_t buffer_size)
}
#endif /* CONFIG_PM_SLEEP || SUPPORT_VGA_SWITCHEROO */
+#ifdef CONFIG_PM_SLEEP
+/* put codec down to D3 at hibernation for Intel SKL+;
+ * otherwise BIOS may still access the codec and screw up the driver
+ */
+#define IS_SKL(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0xa170)
+#define IS_SKL_LP(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x9d70)
+#define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98)
+#define IS_SKL_PLUS(pci) (IS_SKL(pci) || IS_SKL_LP(pci) || IS_BXT(pci))
+
+/*
+ * .freeze_noirq callback: on the Intel parts matched by IS_SKL_PLUS()
+ * put the controller into D3hot; other devices are left untouched.
+ * Always returns 0.
+ */
+static int azx_freeze_noirq(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!IS_SKL_PLUS(pdev))
+		return 0;
+
+	pci_set_power_state(pdev, PCI_D3hot);
+	return 0;
+}
+
+/*
+ * .thaw_noirq callback: counterpart of azx_freeze_noirq().  On the
+ * Intel parts matched by IS_SKL_PLUS() bring the controller back to D0.
+ * Always returns 0.
+ */
+static int azx_thaw_noirq(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (!IS_SKL_PLUS(pdev))
+		return 0;
+
+	pci_set_power_state(pdev, PCI_D0);
+	return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
#ifdef CONFIG_PM
static int azx_runtime_suspend(struct device *dev)
{
static const struct dev_pm_ops azx_pm = {
SET_SYSTEM_SLEEP_PM_OPS(azx_suspend, azx_resume)
+#ifdef CONFIG_PM_SLEEP
+ .freeze_noirq = azx_freeze_noirq,
+ .thaw_noirq = azx_thaw_noirq,
+#endif
SET_RUNTIME_PM_OPS(azx_runtime_suspend, azx_runtime_resume, azx_runtime_idle)
};
ALC_HEADSET_TYPE_OMTP,
};
+enum {
+ ALC_KEY_MICMUTE_INDEX,
+};
+
struct alc_customize_define {
unsigned int sku_cfg;
unsigned char port_connectivity;
void (*power_hook)(struct hda_codec *codec);
#endif
void (*shutup)(struct hda_codec *codec);
+ void (*reboot_notify)(struct hda_codec *codec);
int init_amp;
int codec_variant; /* flag for other variants */
unsigned int pll_coef_idx, pll_coef_bit;
unsigned int coef0;
struct input_dev *kb_dev;
+ u8 alc_mute_keycode_map[1];
};
/*
snd_hda_shutup_pins(codec);
}
+/*
+ * reboot_notify entry point: use the per-codec spec->reboot_notify hook
+ * when one is installed, otherwise fall back to alc_shutup().
+ */
+static void alc_reboot_notify(struct hda_codec *codec)
+{
+	struct alc_spec *spec = codec->spec;
+
+	if (!spec || !spec->reboot_notify) {
+		alc_shutup(codec);
+		return;
+	}
+
+	spec->reboot_notify(codec);
+}
+
+/*
+ * Power the codec down to D3 at reboot/shutdown.
+ * Meant to be installed as the spec->reboot_notify hook.
+ */
+static void alc_d3_at_reboot(struct hda_codec *codec)
+{
+	const hda_nid_t afg = codec->core.afg;
+
+	snd_hda_codec_set_power_to_all(codec, afg, AC_PWRST_D3);
+	snd_hda_codec_write(codec, afg, 0, AC_VERB_SET_POWER_STATE,
+			    AC_PWRST_D3);
+	msleep(10);
+}
+
#define alc_free snd_hda_gen_free
#ifdef CONFIG_PM
.suspend = alc_suspend,
.check_power_status = snd_hda_gen_check_power_status,
#endif
- .reboot_notify = alc_shutup,
+ .reboot_notify = alc_reboot_notify,
};
ALC889_FIXUP_MBA11_VREF,
ALC889_FIXUP_MBA21_VREF,
ALC889_FIXUP_MP11_VREF,
+ ALC889_FIXUP_MP41_VREF,
ALC882_FIXUP_INV_DMIC,
ALC882_FIXUP_NO_PRIMARY_HP,
ALC887_FIXUP_ASUS_BASS,
const struct hda_fixup *fix, int action)
{
struct alc_spec *spec = codec->spec;
- static hda_nid_t nids[2] = { 0x14, 0x15 };
+ static hda_nid_t nids[3] = { 0x14, 0x15, 0x19 };
int i;
if (action != HDA_FIXUP_ACT_INIT)
.chained = true,
.chain_id = ALC885_FIXUP_MACPRO_GPIO,
},
+ [ALC889_FIXUP_MP41_VREF] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc889_fixup_mbp_vref,
+ .chained = true,
+ .chain_id = ALC885_FIXUP_MACPRO_GPIO,
+ },
[ALC882_FIXUP_INV_DMIC] = {
.type = HDA_FIXUP_FUNC,
.v.func = alc_fixup_inv_dmic,
SND_PCI_QUIRK(0x106b, 0x3f00, "Macbook 5,1", ALC889_FIXUP_IMAC91_VREF),
SND_PCI_QUIRK(0x106b, 0x4000, "MacbookPro 5,1", ALC889_FIXUP_IMAC91_VREF),
SND_PCI_QUIRK(0x106b, 0x4100, "Macmini 3,1", ALC889_FIXUP_IMAC91_VREF),
- SND_PCI_QUIRK(0x106b, 0x4200, "Mac Pro 5,1", ALC885_FIXUP_MACPRO_GPIO),
+ SND_PCI_QUIRK(0x106b, 0x4200, "Mac Pro 4,1/5,1", ALC889_FIXUP_MP41_VREF),
SND_PCI_QUIRK(0x106b, 0x4300, "iMac 9,1", ALC889_FIXUP_IMAC91_VREF),
SND_PCI_QUIRK(0x106b, 0x4600, "MacbookPro 5,2", ALC889_FIXUP_IMAC91_VREF),
SND_PCI_QUIRK(0x106b, 0x4900, "iMac 9,1 Aluminum", ALC889_FIXUP_IMAC91_VREF),
/* GPIO2 just toggles on a keypress/keyrelease cycle. Therefore
send both key on and key off event for every interrupt. */
- input_report_key(spec->kb_dev, KEY_MICMUTE, 1);
+ input_report_key(spec->kb_dev, spec->alc_mute_keycode_map[ALC_KEY_MICMUTE_INDEX], 1);
input_sync(spec->kb_dev);
- input_report_key(spec->kb_dev, KEY_MICMUTE, 0);
+ input_report_key(spec->kb_dev, spec->alc_mute_keycode_map[ALC_KEY_MICMUTE_INDEX], 0);
input_sync(spec->kb_dev);
}
+/*
+ * Allocate and register the "Microphone Mute Button" input device and
+ * store it in spec->kb_dev.  The key codes the device can emit are taken
+ * from spec->alc_mute_keycode_map, which is also exposed as the device's
+ * keycode table.
+ *
+ * Returns 0 on success, -ENOMEM if the device cannot be allocated, or the
+ * error returned by input_register_device().  On a registration failure
+ * the device is freed and spec->kb_dev is reset to NULL.
+ */
+static int alc_register_micmute_input_device(struct hda_codec *codec)
+{
+	struct alc_spec *spec = codec->spec;
+	int i, err;
+
+	spec->kb_dev = input_allocate_device();
+	if (!spec->kb_dev) {
+		codec_err(codec, "Out of memory (input_allocate_device)\n");
+		return -ENOMEM;
+	}
+
+	spec->alc_mute_keycode_map[ALC_KEY_MICMUTE_INDEX] = KEY_MICMUTE;
+
+	spec->kb_dev->name = "Microphone Mute Button";
+	spec->kb_dev->evbit[0] = BIT_MASK(EV_KEY);
+	spec->kb_dev->keycodesize = sizeof(spec->alc_mute_keycode_map[0]);
+	spec->kb_dev->keycodemax = ARRAY_SIZE(spec->alc_mute_keycode_map);
+	spec->kb_dev->keycode = spec->alc_mute_keycode_map;
+	for (i = 0; i < ARRAY_SIZE(spec->alc_mute_keycode_map); i++)
+		set_bit(spec->alc_mute_keycode_map[i], spec->kb_dev->keybit);
+
+	err = input_register_device(spec->kb_dev);
+	if (err) {
+		codec_err(codec, "input_register_device failed\n");
+		input_free_device(spec->kb_dev);
+		spec->kb_dev = NULL;
+		/* Report the real cause rather than a blanket -ENOMEM. */
+		return err;
+	}
+
+	return 0;
+}
+
static void alc280_fixup_hp_gpio2_mic_hotkey(struct hda_codec *codec,
const struct hda_fixup *fix, int action)
{
struct alc_spec *spec = codec->spec;
if (action == HDA_FIXUP_ACT_PRE_PROBE) {
- spec->kb_dev = input_allocate_device();
- if (!spec->kb_dev) {
- codec_err(codec, "Out of memory (input_allocate_device)\n");
+ if (alc_register_micmute_input_device(codec) != 0)
return;
- }
- spec->kb_dev->name = "Microphone Mute Button";
- spec->kb_dev->evbit[0] = BIT_MASK(EV_KEY);
- spec->kb_dev->keybit[BIT_WORD(KEY_MICMUTE)] = BIT_MASK(KEY_MICMUTE);
- if (input_register_device(spec->kb_dev)) {
- codec_err(codec, "input_register_device failed\n");
- input_free_device(spec->kb_dev);
- spec->kb_dev = NULL;
- return;
- }
snd_hda_add_verbs(codec, gpio_init);
snd_hda_codec_write_cache(codec, codec->core.afg, 0,
}
}
+/*
+ * Fixup wiring Line2 (NID 0x1b) as the mic-mute hotkey and GPIO2 as the
+ * mic-mute LED.
+ *
+ * PRE_PROBE registers the input device and, if that succeeded, installs
+ * the GPIO verbs, the jack callback and the capture sync hook.  The
+ * PROBE and FREE actions only do anything when the input device exists.
+ */
+static void alc233_fixup_lenovo_line2_mic_hotkey(struct hda_codec *codec,
+					     const struct hda_fixup *fix, int action)
+{
+	/* Line2 = mic mute hotkey
+	   GPIO2 = mic mute LED */
+	static const struct hda_verb gpio_init[] = {
+		{ 0x01, AC_VERB_SET_GPIO_MASK, 0x04 },
+		{ 0x01, AC_VERB_SET_GPIO_DIRECTION, 0x04 },
+		{}
+	};
+	struct alc_spec *spec = codec->spec;
+
+	switch (action) {
+	case HDA_FIXUP_ACT_PRE_PROBE:
+		if (alc_register_micmute_input_device(codec) != 0)
+			break;
+
+		snd_hda_add_verbs(codec, gpio_init);
+		snd_hda_jack_detect_enable_callback(codec, 0x1b,
+						    gpio2_mic_hotkey_event);
+
+		spec->gen.cap_sync_hook = alc_fixup_gpio_mic_mute_hook;
+		spec->gpio_led = 0;
+		spec->mute_led_polarity = 0;
+		spec->gpio_mic_led_mask = 0x04;
+		break;
+	case HDA_FIXUP_ACT_PROBE:
+		if (spec->kb_dev)
+			spec->init_amp = ALC_INIT_DEFAULT;
+		break;
+	case HDA_FIXUP_ACT_FREE:
+		if (spec->kb_dev) {
+			input_unregister_device(spec->kb_dev);
+			spec->kb_dev = NULL;
+		}
+		break;
+	}
+}
+
static void alc269_fixup_hp_line1_mic1_led(struct hda_codec *codec,
const struct hda_fixup *fix, int action)
{
struct alc_spec *spec = codec->spec;
if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+ spec->shutup = alc_no_shutup; /* reduce click noise */
+ spec->reboot_notify = alc_d3_at_reboot; /* reduce noise */
spec->parse_flags = HDA_PINCFG_NO_HP_FIXUP;
codec->power_save_node = 0; /* avoid click noises */
snd_hda_apply_pincfgs(codec, pincfgs);
}
}
-/* additional fixup for Thinkpad T440s noise problem */
-static void alc_fixup_tpt440(struct hda_codec *codec,
- const struct hda_fixup *fix, int action)
-{
- struct alc_spec *spec = codec->spec;
-
- if (action == HDA_FIXUP_ACT_PRE_PROBE) {
- spec->shutup = alc_no_shutup; /* reduce click noise */
- spec->gen.mixer_nid = 0; /* reduce background noise */
- }
-}
-
static void alc_shutup_dell_xps13(struct hda_codec *codec)
{
struct alc_spec *spec = codec->spec;
ALC288_FIXUP_DISABLE_AAMIX,
ALC292_FIXUP_DELL_E7X,
ALC292_FIXUP_DISABLE_AAMIX,
+ ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK,
ALC298_FIXUP_DELL1_MIC_NO_PRESENCE,
ALC275_FIXUP_DELL_XPS,
ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE,
ALC293_FIXUP_LENOVO_SPK_NOISE,
+ ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY,
};
static const struct hda_fixup alc269_fixups[] = {
},
[ALC292_FIXUP_TPT440] = {
.type = HDA_FIXUP_FUNC,
- .v.func = alc_fixup_tpt440,
+ .v.func = alc_fixup_disable_aamix,
.chained = true,
.chain_id = ALC292_FIXUP_TPT440_DOCK,
},
.chained = true,
.chain_id = ALC269_FIXUP_DELL2_MIC_NO_PRESENCE
},
+ [ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc_fixup_disable_aamix,
+ .chained = true,
+ .chain_id = ALC293_FIXUP_DELL1_MIC_NO_PRESENCE
+ },
[ALC292_FIXUP_DELL_E7X] = {
.type = HDA_FIXUP_FUNC,
.v.func = alc_fixup_dell_xps13,
.chained = true,
.chain_id = ALC269_FIXUP_THINKPAD_ACPI
},
+ [ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY] = {
+ .type = HDA_FIXUP_FUNC,
+ .v.func = alc233_fixup_lenovo_line2_mic_hotkey,
+ },
};
static const struct snd_pci_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x1028, 0x06c7, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1028, 0x06d9, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1028, 0x06da, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
- SND_PCI_QUIRK(0x1028, 0x06db, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
- SND_PCI_QUIRK(0x1028, 0x06dd, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
- SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
- SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
- SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC292_FIXUP_DISABLE_AAMIX),
+ SND_PCI_QUIRK(0x1028, 0x06db, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
+ SND_PCI_QUIRK(0x1028, 0x06dd, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
+ SND_PCI_QUIRK(0x1028, 0x06de, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
+ SND_PCI_QUIRK(0x1028, 0x06df, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
+ SND_PCI_QUIRK(0x1028, 0x06e0, "Dell", ALC293_FIXUP_DISABLE_AAMIX_MULTIJACK),
SND_PCI_QUIRK(0x1028, 0x0704, "Dell XPS 13", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
SND_PCI_QUIRK(0x17aa, 0x2212, "Thinkpad T440", ALC292_FIXUP_TPT440_DOCK),
SND_PCI_QUIRK(0x17aa, 0x2214, "Thinkpad X240", ALC292_FIXUP_TPT440_DOCK),
SND_PCI_QUIRK(0x17aa, 0x2215, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
+ SND_PCI_QUIRK(0x17aa, 0x2218, "Thinkpad X1 Carbon 2nd", ALC292_FIXUP_TPT440_DOCK),
SND_PCI_QUIRK(0x17aa, 0x2223, "ThinkPad T550", ALC292_FIXUP_TPT440_DOCK),
SND_PCI_QUIRK(0x17aa, 0x2226, "ThinkPad X250", ALC292_FIXUP_TPT440_DOCK),
SND_PCI_QUIRK(0x17aa, 0x2233, "Thinkpad", ALC293_FIXUP_LENOVO_SPK_NOISE),
+ SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC),
SND_PCI_QUIRK(0x17aa, 0x3978, "IdeaPad Y410P", ALC269_FIXUP_NO_SHUTUP),
SND_PCI_QUIRK(0x17aa, 0x5013, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
bool reconfig;
unsigned int aif_tx_state, aif_rx_state;
- if (params_rate(params) % 8000)
+ if (params_rate(params) % 4000)
rates = &arizona_44k1_bclk_rates[0];
else
rates = &arizona_48k_bclk_rates[0];
static const DECLARE_TLV_DB_SCALE(bypass_tlv, -1500, 300, 0);
static const DECLARE_TLV_DB_SCALE(mic_tlv, 0, 300, 0);
-static const int deemph_settings[] = { 0, 32000, 44100, 48000 };
+static const struct {
+ int rate;
+ unsigned int val;
+} deemph_settings[] = {
+ { 0, ES8328_DACCONTROL6_DEEMPH_OFF },
+ { 32000, ES8328_DACCONTROL6_DEEMPH_32k },
+ { 44100, ES8328_DACCONTROL6_DEEMPH_44_1k },
+ { 48000, ES8328_DACCONTROL6_DEEMPH_48k },
+};
static int es8328_set_deemph(struct snd_soc_codec *codec)
{
* rate.
*/
if (es8328->deemph) {
- best = 1;
- for (i = 2; i < ARRAY_SIZE(deemph_settings); i++) {
- if (abs(deemph_settings[i] - es8328->playback_fs) <
- abs(deemph_settings[best] - es8328->playback_fs))
+ best = 0;
+ for (i = 1; i < ARRAY_SIZE(deemph_settings); i++) {
+ if (abs(deemph_settings[i].rate - es8328->playback_fs) <
+ abs(deemph_settings[best].rate - es8328->playback_fs))
best = i;
}
- val = best << 1;
+ val = deemph_settings[best].val;
} else {
- val = 0;
+ val = ES8328_DACCONTROL6_DEEMPH_OFF;
}
dev_dbg(codec->dev, "Set deemphasis %d\n", val);
- return snd_soc_update_bits(codec, ES8328_DACCONTROL6, 0x6, val);
+ return snd_soc_update_bits(codec, ES8328_DACCONTROL6,
+ ES8328_DACCONTROL6_DEEMPH_MASK, val);
}
static int es8328_get_deemph(struct snd_kcontrol *kcontrol,
#define ES8328_DACCONTROL6_CLICKFREE (1 << 3)
#define ES8328_DACCONTROL6_DAC_INVR (1 << 4)
#define ES8328_DACCONTROL6_DAC_INVL (1 << 5)
+#define ES8328_DACCONTROL6_DEEMPH_MASK (3 << 6)
#define ES8328_DACCONTROL6_DEEMPH_OFF (0 << 6)
#define ES8328_DACCONTROL6_DEEMPH_32k (1 << 6)
#define ES8328_DACCONTROL6_DEEMPH_44_1k (2 << 6)
RT5645_PWR_CLS_D_L,
RT5645_PWR_CLS_D | RT5645_PWR_CLS_D_R |
RT5645_PWR_CLS_D_L);
+ snd_soc_update_bits(codec, RT5645_GEN_CTRL3,
+ RT5645_DET_CLK_MASK, RT5645_DET_CLK_MODE1);
break;
case SND_SOC_DAPM_PRE_PMD:
+ snd_soc_update_bits(codec, RT5645_GEN_CTRL3,
+ RT5645_DET_CLK_MASK, RT5645_DET_CLK_DIS);
snd_soc_write(codec, RT5645_EQ_CTRL2, 0);
snd_soc_update_bits(codec, RT5645_PWR_DIG1,
RT5645_PWR_CLS_D | RT5645_PWR_CLS_D_R |
/* General Control3 (0xfc) */
#define RT5645_JD_PSV_MODE (0x1 << 12)
#define RT5645_IRQ_CLK_GATE_CTRL (0x1 << 11)
+#define RT5645_DET_CLK_MASK (0x3 << 9)
+#define RT5645_DET_CLK_DIS (0x0 << 9)
+#define RT5645_DET_CLK_MODE1 (0x1 << 9)
+#define RT5645_DET_CLK_MODE2 (0x2 << 9)
#define RT5645_MICINDET_MANU (0x1 << 7)
#define RT5645_RING2_SLEEVE_GND (0x1 << 5)
case SND_SOC_DAPM_POST_PMU:
snd_soc_update_bits(codec, SGTL5000_CHIP_ANA_POWER,
SGTL5000_VAG_POWERUP, SGTL5000_VAG_POWERUP);
+ msleep(400);
break;
case SND_SOC_DAPM_PRE_PMD:
.max_register = WM8974_MONOMIX,
.reg_defaults = wm8974_reg_defaults,
.num_reg_defaults = ARRAY_SIZE(wm8974_reg_defaults),
+ .cache_type = REGCACHE_FLAT,
};
static int wm8974_probe(struct snd_soc_codec *codec)
/* wait for XDATA to be cleared */
cnt = 0;
- while (!(mcasp_get_reg(mcasp, DAVINCI_MCASP_TXSTAT_REG) &
- ~XRDATA) && (cnt < 100000))
+ while ((mcasp_get_reg(mcasp, DAVINCI_MCASP_TXSTAT_REG) & XRDATA) &&
+ (cnt < 100000))
cnt++;
/* Release TX state machine */
FSL_SAI_CSR_FR, FSL_SAI_CSR_FR);
regmap_update_bits(sai->regmap, FSL_SAI_RCSR,
FSL_SAI_CSR_FR, FSL_SAI_CSR_FR);
+
+ /*
+ * In SAI master mode, after the SAI has been opened and
+ * closed several times, the frame clock can stop and
+ * never recover. Issue a software reset to work around
+ * this. This is a hardware bug that will be fixed in
+ * the next SAI version.
+ */
+ if (!sai->is_slave_mode) {
+ /* Software Reset for both Tx and Rx */
+ regmap_write(sai->regmap,
+ FSL_SAI_TCSR, FSL_SAI_CSR_SR);
+ regmap_write(sai->regmap,
+ FSL_SAI_RCSR, FSL_SAI_CSR_SR);
+ /* Clear SR bit to finish the reset */
+ regmap_write(sai->regmap, FSL_SAI_TCSR, 0);
+ regmap_write(sai->regmap, FSL_SAI_RCSR, 0);
+ }
}
break;
default:
*/
ret = snd_soc_tplg_component_load(&platform->component,
&skl_tplg_ops, fw, 0);
- release_firmware(fw);
if (ret < 0) {
dev_err(bus->dev, "tplg component load failed%d\n", ret);
return -EINVAL;
skl->resource.max_mcps = SKL_MAX_MCPS;
skl->resource.max_mem = SKL_FW_MAX_MEM;
+ skl->tplg = fw;
+
return 0;
}
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/platform_device.h>
+#include <linux/firmware.h>
#include <sound/pcm.h>
#include "skl.h"
struct hdac_ext_bus *ebus = pci_get_drvdata(pci);
struct skl *skl = ebus_to_skl(ebus);
+ if (skl->tplg)
+ release_firmware(skl->tplg);
+
if (pci_dev_run_wake(pci))
pm_runtime_get_noresume(&pci->dev);
pci_dev_put(pci);
struct skl_dsp_resource resource;
struct list_head ppl_list;
struct list_head dapm_path_list;
+
+ const struct firmware *tplg;
};
#define skl_to_ebus(s) (&(s)->ebus)
case SNDRV_PCM_TRIGGER_RESUME:
case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
ret = regmap_update_bits(spdif->regmap, SPDIF_DMACR,
- SPDIF_DMACR_TDE_ENABLE,
- SPDIF_DMACR_TDE_ENABLE);
+ SPDIF_DMACR_TDE_ENABLE |
+ SPDIF_DMACR_TDL_MASK,
+ SPDIF_DMACR_TDE_ENABLE |
+ SPDIF_DMACR_TDL(16));
if (ret != 0)
return ret;
#define SPDIF_DMACR_TDL_SHIFT 0
#define SPDIF_DMACR_TDL(x) ((x) << SPDIF_DMACR_TDL_SHIFT)
-#define SPDIF_DMACR_TDL_MASK (0x1f << SDPIF_DMACR_TDL_SHIFT)
+#define SPDIF_DMACR_TDL_MASK (0x1f << SPDIF_DMACR_TDL_SHIFT)
/*
* XFER
}
}
+ snd_usb_mixer_fu_apply_quirk(state->mixer, cval, unitid, kctl);
+
range = (cval->max - cval->min) / cval->res;
/*
* Are there devices with volume range more than 255? I use a bit more
{ 0 } /* terminator */
};
-/* Dragonfly DAC 1.2, the dB conversion factor is 1 instead of 256 */
-static struct usbmix_dB_map dragonfly_1_2_dB = {0, 5000};
-static struct usbmix_name_map dragonfly_1_2_map[] = {
- { 7, NULL, .dB = &dragonfly_1_2_dB },
- { 0 } /* terminator */
-};
-
/*
* Control map entries
*/
.id = USB_ID(0x05a7, 0x1020),
.map = bose_companion5_map,
},
- {
- /* Dragonfly DAC 1.2 */
- .id = USB_ID(0x21b4, 0x0081),
- .map = dragonfly_1_2_map,
- },
{ 0 } /* terminator */
};
#include <sound/control.h>
#include <sound/hwdep.h>
#include <sound/info.h>
+#include <sound/tlv.h>
#include "usbaudio.h"
#include "mixer.h"
}
}
+/*
+ * Replace the control's dB information with a fixed TLV range table and
+ * switch the element from TLV-callback access to plain TLV-read access.
+ */
+static void snd_dragonfly_quirk_db_scale(struct usb_mixer_interface *mixer,
+					  struct snd_kcontrol *kctl)
+{
+	/* Approximation using 10 ranges based on output measurement on hw v1.2.
+	 * This seems close to the cubic mapping e.g. alsamixer uses. */
+	static const DECLARE_TLV_DB_RANGE(dragonfly_scale,
+		 0,  1, TLV_DB_MINMAX_ITEM(-5300, -4970),
+		 2,  5, TLV_DB_MINMAX_ITEM(-4710, -4160),
+		 6,  7, TLV_DB_MINMAX_ITEM(-3884, -3710),
+		 8, 14, TLV_DB_MINMAX_ITEM(-3443, -2560),
+		15, 16, TLV_DB_MINMAX_ITEM(-2475, -2324),
+		17, 19, TLV_DB_MINMAX_ITEM(-2228, -2031),
+		20, 26, TLV_DB_MINMAX_ITEM(-1910, -1393),
+		27, 31, TLV_DB_MINMAX_ITEM(-1322, -1032),
+		32, 40, TLV_DB_MINMAX_ITEM(-968, -490),
+		41, 50, TLV_DB_MINMAX_ITEM(-441, 0),
+	);
+
+	usb_audio_info(mixer->chip, "applying DragonFly dB scale quirk\n");
+	kctl->tlv.p = dragonfly_scale;
+	kctl->vd[0].access = (kctl->vd[0].access |
+			      SNDRV_CTL_ELEM_ACCESS_TLV_READ) &
+			     ~SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK;
+}
+
+/*
+ * Apply device-specific quirks to a feature-unit control, selected by
+ * the USB ID of the chip.  Devices without an entry are left untouched.
+ */
+void snd_usb_mixer_fu_apply_quirk(struct usb_mixer_interface *mixer,
+				  struct usb_mixer_elem_info *cval, int unitid,
+				  struct snd_kcontrol *kctl)
+{
+	switch (mixer->chip->usb_id) {
+	case USB_ID(0x21b4, 0x0081): /* AudioQuest DragonFly */
+		/* Only unit 7 with the 0..50 range gets the dB table. */
+		if (unitid != 7 || cval->min != 0 || cval->max != 50)
+			break;
+		snd_dragonfly_quirk_db_scale(mixer, kctl);
+		break;
+	}
+}
+
void snd_usb_mixer_rc_memory_change(struct usb_mixer_interface *mixer,
int unitid);
+void snd_usb_mixer_fu_apply_quirk(struct usb_mixer_interface *mixer,
+ struct usb_mixer_elem_info *cval, int unitid,
+ struct snd_kcontrol *kctl);
+
#endif /* SND_USB_MIXER_QUIRKS_H */
case USB_ID(0x045E, 0x0779): /* MS Lifecam HD-3000 */
case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */
case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */
+ case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */
return true;
}
return false;
#
# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
-grace=120
-
T=/tmp/kvm-test-1-run.sh.$$
trap 'rm -rf $T' 0
touch $T
qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
# Generate architecture-specific and interaction-specific qemu arguments
-qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`"
+qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`"
# Generate qemu -append arguments
qemu_append="`identify_qemu_append "$QEMU"`"
touch $resdir/buildonly
exit 0
fi
-echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log
+echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) &
qemu_pid=$!
else
break
fi
- if test $kruntime -ge $((seconds + grace))
+ if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE))
then
echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1
kill -KILL $qemu_pid
done
fi
-cp $builddir/console.log $resdir
parse-torture.sh $resdir/console.log $title
parse-console.sh $resdir/console.log $title
TORTURE_BOOT_IMAGE=""
TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
TORTURE_KMAKE_ARG=""
+TORTURE_SHUTDOWN_GRACE=180
TORTURE_SUITE=rcu
resdir=""
configs=""
resdir=$2
shift
;;
+ --shutdown-grace)
+ checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error'
+ TORTURE_SHUTDOWN_GRACE=$2
+ shift
+ ;;
--torture)
checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--'
TORTURE_SUITE=$2
TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC
+TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE
TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE
if ! test -e $resdir
then
}
# Dump out the scripting required to run one test batch.
-function dump(first, pastlast)
+function dump(first, pastlast, batchnum)
{
- print "echo ----Start batch: `date`";
- print "echo ----Start batch: `date` >> " rd "/log";
+ print "echo ----Start batch " batchnum ": `date`";
+ print "echo ----Start batch " batchnum ": `date` >> " rd "/log";
jn=1
for (j = first; j < pastlast; j++) {
builddir=KVM "/b" jn
njobs = i;
nc = ncpus;
first = 0;
+ batchnum = 1;
# Each pass through the following loop considers one test.
for (i = 0; i < njobs; i++) {
if (ncpus == 0) {
# Sequential test specified, each test its own batch.
- dump(i, i + 1);
+ dump(i, i + 1, batchnum);
first = i;
+ batchnum++;
} else if (nc < cpus[i] && i != 0) {
# Out of CPUs, dump out a batch.
- dump(first, i);
+ dump(first, i, batchnum);
first = i;
nc = ncpus;
+ batchnum++;
}
# Account for the CPUs needed by the current test.
nc -= cpus[i];
}
# Dump the last batch.
if (ncpus != 0)
- dump(first, i);
+ dump(first, i, batchnum);
}' >> $T/script
cat << ___EOF___ >> $T/script
#
# Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
-T=/tmp/abat-chk-badness.sh.$$
-trap 'rm -f $T' 0
-
file="$1"
title="$2"
then
print_warning Console output contains nul bytes, old qemu still running?
fi
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T
-if test -s $T
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
+if test -s $1.diags
then
print_warning Assertion failure in $file $title
- cat $T
+ # cat $1.diags
+ summary=""
+ n_badness=`grep -c Badness $1`
+ if test "$n_badness" -ne 0
+ then
+ summary="$summary Badness: $n_badness"
+ fi
+ n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'`
+ if test "$n_warn" -ne 0
+ then
+ summary="$summary Warnings: $n_warn"
+ fi
+ n_bugs=`egrep -c 'BUG|Oops:' $1`
+ if test "$n_bugs" -ne 0
+ then
+ summary="$summary Bugs: $n_bugs"
+ fi
+ n_calltrace=`grep -c 'Call Trace:' $1`
+ if test "$n_calltrace" -ne 0
+ then
+ summary="$summary Call Traces: $n_calltrace"
+ fi
+	# Count the '===========' separator lines and report them under
+	# "lockdep", keyed on this count rather than the Badness count.
+	n_lockdep=`grep -c =========== $1`
+	if test "$n_lockdep" -ne 0
+	then
+		summary="$summary lockdep: $n_lockdep"
+	fi
+ n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1`
+ if test "$n_stalls" -ne 0
+ then
+ summary="$summary Stalls: $n_stalls"
+ fi
+ print_warning Summary: $summary
fi
CONFIG_NO_HZ_FULL_SYSIDLE
CONFIG_RCU_NOCB_CPU
-CONFIG_RCU_USER_QS
Meaningless for TINY_RCU.
Always used in KVM testing.
-CONFIG_RCU_USER_QS
-
- Redundant with CONFIG_NO_HZ_FULL.
-
CONFIG_PREEMPT_RCU
CONFIG_TREE_RCU
return true;
}
- return dist_active_irq(vcpu);
+ return vgic_irq_is_active(vcpu, map->virt_irq);
}
/*